Prior to this script, MutationTimeR was run on 2x 2,778 VCF files for subs and indels each. This was done using the script VCF-annotate.R, attached at the end of this vignette.
Input files:
../final/final_consensus_12oct_passonly../final/consensus.20170119.somatic.cna.annotated../final/structure_weme_released_consensus_merged.txt, ../final/wcc_consensus_values_9_12.tsv../final/consensus.20170218.purity.ploidy.txtOutput:
cat(paste0(system("for d in `find ../final/annotated_014 -maxdepth 1 -type d`
do echo $d; ls $d | head -6;
done", ignore.stderr=TRUE, intern=TRUE), collapse="\n"))
## ../final/annotated_014
## clusters
## cn
## graylist
## indel
## other
## snv_mnv
## ../final/annotated_014/snv_mnv
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20160830.somatic.snv_mnv.complete_annotation.vcf.RData
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20160830.somatic.snv_mnv.complete_annotation.vcf.bgz
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20160830.somatic.snv_mnv.complete_annotation.vcf.RData
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20160830.somatic.snv_mnv.complete_annotation.vcf.bgz
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20160830.somatic.snv_mnv.complete_annotation.vcf.RData
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20160830.somatic.snv_mnv.complete_annotation.vcf.bgz
## ../final/annotated_014/graylist
## clusters
## cn
## indel
## other
## snv_mnv
## ../final/annotated_014/other
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20160830.somatic.snv_mnv.complete_annotation.pdf
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20160830.somatic.snv_mnv.complete_annotation.pdf
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20160830.somatic.snv_mnv.complete_annotation.pdf
## 00493087-9d9d-40ca-86d5-936f1b951c93.consensus.20160830.somatic.snv_mnv.complete_annotation.pdf
## 00508f2b-36bf-44fc-b66b-97e1f3e40bfa.consensus.20160830.somatic.snv_mnv.complete_annotation.pdf
## 005794f1-5a87-45b5-9811-83ddf6924568.consensus.20160830.somatic.snv_mnv.complete_annotation.pdf
## ../final/annotated_014/indel
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20161006.somatic.indel.complete_annotation.vcf.RData
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20161006.somatic.indel.complete_annotation.vcf.bgz
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20161006.somatic.indel.complete_annotation.vcf.RData
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20161006.somatic.indel.complete_annotation.vcf.bgz
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20161006.somatic.indel.complete_annotation.vcf.RData
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20161006.somatic.indel.complete_annotation.vcf.bgz
## ../final/annotated_014/clusters
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20160830.somatic.snv_mnv.complete_annotation.clusters_purity.RData
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20160830.somatic.snv_mnv.complete_annotation.clusters_purity.RData
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20160830.somatic.snv_mnv.complete_annotation.clusters_purity.RData
## 00493087-9d9d-40ca-86d5-936f1b951c93.consensus.20160830.somatic.snv_mnv.complete_annotation.clusters_purity.RData
## 00508f2b-36bf-44fc-b66b-97e1f3e40bfa.consensus.20160830.somatic.snv_mnv.complete_annotation.clusters_purity.RData
## 005794f1-5a87-45b5-9811-83ddf6924568.consensus.20160830.somatic.snv_mnv.complete_annotation.clusters_purity.RData
## ../final/annotated_014/cn
## 0009b464-b376-4fbc-8a56-da538269a02f.consensus.20160830.somatic.snv_mnv.complete_annotation.bb_granges.RData
## 003819bc-c415-4e76-887c-931d60ed39e7.consensus.20160830.somatic.snv_mnv.complete_annotation.bb_granges.RData
## 0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20160830.somatic.snv_mnv.complete_annotation.bb_granges.RData
## 00493087-9d9d-40ca-86d5-936f1b951c93.consensus.20160830.somatic.snv_mnv.complete_annotation.bb_granges.RData
## 00508f2b-36bf-44fc-b66b-97e1f3e40bfa.consensus.20160830.somatic.snv_mnv.complete_annotation.bb_granges.RData
## 005794f1-5a87-45b5-9811-83ddf6924568.consensus.20160830.somatic.snv_mnv.complete_annotation.bb_granges.RData
Annotated VCF files and copy number segments with timing information will be loaded as shown below. This requires about 100G memory.
Load convenience R functions (code shown at the end of this vignette).
source("PCAWG-functions.R")
## Warning in eval(ei, envir): NAs introduced by coercion
Prepare spreadsheets for figure data
library(xlsx)
Figure1 <- createWorkbook()
Figure2 <- createWorkbook()
Figure4 <- createWorkbook()
Figure5 <- createWorkbook()
ExtendedDataFigure3 <- createWorkbook()
ExtendedDataFigure6 <- createWorkbook()
ExtendedDataFigure8 <- createWorkbook()
ExtendedDataFigure9 <- createWorkbook()
## Warning in eval(ei, envir): NAs introduced by coercion
First 2,703 whitelisted samples
Load annotated VCF files for SNVs and MNVs into a list with VariantAnnotation::VCF() objects:
p <- "../final/annotated_014/snv_mnv"
d <- dir(p, pattern="*.vcf.RData", full.names=TRUE)
finalSnv <- unlist(mclapply(split(d, seq_along(d) %/% 100), lapply, function(f) { # read in batches of 100
e <- new.env()
load(f, envir=e)
e$vcf
}, mc.preschedule=FALSE), recursive=FALSE)
names(finalSnv) <- sub(".conse.+","",dir(p, pattern="*.vcf.RData", full.names=FALSE))
Load copynumber profiles as a list of GRanges():
p <- "../final/annotated_014/cn"
finalBB <- list()
for( f in dir(p, pattern="*.bb_granges.RData", full.names=TRUE)){
load(f)
colnames(mcols(bb)) <- sub("star.1","time.star",colnames(mcols(bb)) ) # Fix naming problem
finalBB[[f]] <- bb
}
names(finalBB) <- sub(".conse.+","",dir(p, pattern="*.bb_granges.RData", full.names=FALSE))
Load annotated VCF files for indels into a list with VariantAnnotation::VCF() objects:
p <- "../final/annotated_014/indel"
d <- dir(p, pattern="*.vcf.RData", full.names=TRUE)
finalIndel <- unlist(mclapply(split(d, seq_along(d) %/% 100), lapply, function(f) { # read in batches of 100
e <- new.env()
load(f, envir=e)
e$vcfIndel
}, mc.preschedule=FALSE), recursive=FALSE)
names(finalIndel) <- sub(".conse.+","",dir(p, pattern="*.vcf.RData", full.names=FALSE))
These were input to MutationTimeR. Loaded as a list() (clusters) and data.frame (purity/ploidy).
finalClusters <- list()
finalPurity <- numeric()
p <- "../final/annotated_014/clusters"
for( f in dir(p, pattern="*.RData", full.names=TRUE)){
load(f)
finalClusters[[f]] <- clusters
finalPurity[f] <- purity
}
names(finalClusters) <- names(finalPurity) <- sub(".conse.+","",dir(p, pattern="*.RData", full.names=FALSE))
finalDrivers is a VariantAnnotation::VRanges() object with driver point mutations (substitutions and indels), as provided by the PCAWG drivers working group. Here, we match these data with annotation from MutationTimeR. To this end, match samples and positions and copy annotation.
finalDriversAnnotated <- finalDrivers
d <- info(finalSnv[[3]])[seq_along(finalDriversAnnotated),19:32]
#d[,] <- NA
mcols(finalDriversAnnotated)[colnames(d)] <- d
for(i in seq_along(finalDriversAnnotated)){
if(finalDriversAnnotated$mut_type[i] %in% c("snv","mnv")){
v <- finalSnv[[as.character(finalDriversAnnotated$sample[i])]]
}else{
v <- finalIndel[[as.character(finalDriversAnnotated$sample[i])]]
}
j <- findOverlaps(finalDriversAnnotated[i], v, select='first')
if(!is.na(j)){
mcols(finalDriversAnnotated)[i,colnames(d)] <- info(v)[j, colnames(d)]
refDepth(finalDriversAnnotated)[i] <- info(v)[j,"t_ref_count"]
altDepth(finalDriversAnnotated)[i] <- info(v)[j,"t_alt_count"]
}
else
mcols(finalDriversAnnotated)[i,colnames(d)] <- NA
}
75 graylisted samples were processed separately using MutationTimeR. We load these just like the whitelisted samples and concatenate the output.
p <- "../final/annotated_014/graylist/snv_mnv"
finalSnvGray <- mclapply(dir(p, pattern="*.vcf.RData", full.names=TRUE), function(f) {
e <- new.env()
load(f, envir=e)
e$vcf
})
names(finalSnvGray) <- sub(".conse.+","",dir(p, pattern="*.vcf.RData", full.names=FALSE))
finalSnv[names(finalSnvGray)] <- finalSnvGray
p <- "../final/annotated_014/graylist/cn"
finalBBGray <- list()
for( f in dir(p, pattern="*.bb_granges.RData", full.names=TRUE)){
load(f)
colnames(mcols(bb)) <- sub("star.1","time.star",colnames(mcols(bb)) ) # Fix naming problem
finalBBGray[[f]] <- bb
}
names(finalBBGray) <- sub(".conse.+","",dir(p, pattern="*.bb_granges.RData", full.names=FALSE))
finalBB[names(finalBBGray)] <- finalBBGray
p <- "../final/annotated_014/graylist/indel"
finalIndelGray <- mclapply(dir(p, pattern="*.vcf.RData", full.names=TRUE), function(f) {
e <- new.env()
load(f, envir=e)
e$vcfIndel
})
names(finalIndelGray) <- sub(".conse.+","",dir(p, pattern="*.vcf.RData", full.names=FALSE))
finalIndel[names(finalIndelGray)] <- finalIndelGray
finalClustersGray <- list()
finalPurityGray <- numeric()
p <- "../final/annotated_014/graylist/clusters"
for( f in dir(p, pattern="*.RData", full.names=TRUE)){
load(f)
finalClustersGray[[f]] <- clusters
finalPurityGray[f] <- purity
}
names(finalClustersGray) <- names(finalPurityGray) <- sub(".conse.+","",dir(p, pattern="*.RData", full.names=FALSE))
finalClusters[names(finalClustersGray)] <- finalClustersGray
finalPurity <- c(finalPurity, finalPurityGray)
whiteList <- seq_along(finalSnv) %in% 1:2703
grayList <- !whiteList
Lastly, we load structural variant data. This is only available for a subset. We pad missing samples with NA
finalSv <- mclapply(dir("../final/pcawg_consensus_1.6.161116.somatic_svs", pattern='*.vcf.gz$', full.names=TRUE), function(x) {
t <- try(readVcf(x))
return(t)
})
names(finalSv) <- sub("../final/pcawg_consensus_1.6.161116.somatic_svs/","", sub(".pcawg_consensus_1.6.161116.somatic.sv.vcf.gz","",dir("../final/pcawg_consensus_1.6.161116.somatic_svs", pattern='*.vcf.gz$', full.names=TRUE)))
finalSv <- finalSv[names(finalSnv)]
Here we calculate basic QC for MutationTimeR output. info(vcf)$pMutCNTail contains the beta-binomial tail probability pbetabinom for every variant for its most likely clonal state. These should be roughly uniform, hence 1% outside [0.005,0.995] and about 5% outside [0.025,0.975]. Larger deviations indicate that either purity, copy number or clusters were wrong in a given sample.
q1 <- sapply(finalSnv, function(vcf) mean(abs(0.5- info(vcf)$pMutCNTail) > 0.495 , na.rm=TRUE))
q5 <- sapply(finalSnv, function(vcf) mean(abs(0.5- info(vcf)$pMutCNTail) > 0.475 , na.rm=TRUE))
Most samples agree, on average:
par(mfrow=c(1,1))
boxplot(1-q5 ~ donor2type[sample2donor[names(finalSnv)]], las=2, ylab="Fraction of data inside theoretical 95% CI")
abline(h=0.95, lty=3)
QQ-plots show few outliers per sample:
#pdf("QQplots.pdf", 4,4, pointsize=8)
par(mfrow=c(5,5))
for(i in seq_along(finalSnv)[1:25]){
n <- nrow(finalSnv[[i]])
qqnorm(qnorm(info(finalSnv[[i]])$pMutCNTail[sample(1:n, min(1e4,n))]), main=paste(substr(names(finalSnv)[i],1,8), "Q5 =", signif(q5[i],2), ", Q1 =", signif(q1[i],2)), xlim=c(-5,5), ylim=c(-5,5), pch=16)
abline(0,1, col='red')
}
#dev.off()
Here we tabulate all driver point mutations and their clonal allele status.
These are the maximum a posteriori (MAP) assignments, used for the early/late/clonal/subclonal annotation output of MutationTimeR.
finalGenotypesSnv <- simplify2array(mclapply(finalSnv[whiteList], getGenotype, useNA="always"))
finalGenotypesIndel <- simplify2array(mclapply(finalIndel[whiteList], getGenotype, useNA="always"))
finalGenotypes <- aperm(abind::abind(subs=finalGenotypesSnv,indels=finalGenotypesIndel, along=5), c(1,5,2,3,4))
rm(finalGenotypesSnv,finalGenotypesIndel)
Alternatively to the hard MAP assignments one can calculate the actual probailities per state and assess their overall distribution. This could avoid some biases of the MAP estimates, which are typically biased towards the more populous clonal states.
finalGenotypesSnvP <- simplify2array(mclapply(finalSnv[whiteList], probGenotype))
finalGenotypesIndelP <- simplify2array(mclapply(finalIndel[whiteList], probGenotype))
finalGenotypesP <- aperm(abind::abind(subs=finalGenotypesSnvP,indels=finalGenotypesIndelP, along=4), c(1,4,2,3))
rm(finalGenotypesSnvP,finalGenotypesIndelP)
As QC, we aggregate also the tail probabilities to find out whether there were annotation issues for some driver genes
finalGenotypesSnvQ <- simplify2array(mclapply(finalSnv[whiteList], probGenotypeTail))
finalGenotypesIndelQ <- simplify2array(mclapply(finalIndel[whiteList], probGenotypeTail))
finalGenotypesQ <- aperm(abind::abind(subs=finalGenotypesSnvQ,indels=finalGenotypesIndelQ, along=3), c(1,3,2))
rm(finalGenotypesSnvQ,finalGenotypesIndelQ)
As the previous steps take rather long, introduce a checkpoint here and serialise the data, so it can be loaded later on.
save.image(file=dumpfile, compress=FALSE)
save(finalGenotypes, finalGenotypesP, finalGenotypesQ, file=paste0(Sys.Date(),"-FinalGenotypes.RData"))
Here we assess the timing of point mutations, including driver gene mutations.
First, find samples with the same donor to avoid double dipping for some computations.
w <- names(finalSnv)
n <- names(which(table(sample2donor[w]) > 1)) # donors
s <- w[w %in% names(sample2donor[sample2donor %in% n])] # multisamples
d <- setdiff(sample2donor[w], sample2donor[levels(finalDrivers$sample)]) # donors w/o driver
u <- sample2donor[s[sample2donor[s] %in% intersect(d,n)]]
selectedSamples <- !w %in% setdiff(s[!s %in% finalDrivers$sample ], names(u)[!duplicated(u)])
uniqueSamples <- !duplicated(sample2donor[names(finalSnv)])
First assess the overall distributions of early/late/clonal/subclonal variants per sample, separately for subs and indels.
f <- function(x) unlist(sapply(seq_along(x), function(i) rep(i, x[i])))
d <- t(asum(finalGenotypesP[,"subs",,], 1))
o <- order(droplevels(donor2type[sample2donor[rownames(d)]]), -d[,1]/rowSums(d))
I <- t(apply(d/rowSums(d), 1, function(x) f(mg14:::roundProp(x * 100,p=100))))
d <- t(asum(finalGenotypesP[,"indels",,], 1))
J <- t(apply(d/rowSums(d), 1, function(x) if(!any(is.nan(x))) f(mg14:::roundProp(x * 100,p=100)) else rep(NA,100)))
s <- cumsum(table(droplevels(donor2type[sample2donor[rownames(d)]][o])))
col <- RColorBrewer::brewer.pal(9, "Set1")[c(3,4,2,1,9)] ## Colors for early-subclonal
par(fig=c(0,1,0,1),mar=c(1,4,1,1)+.1, mgp=c(2,.5,0), mfrow=c(3,1), bty="n", las=2, xpd=FALSE)
image(z=I[o,], x=1:nrow(I), useRaster=TRUE, col=col, xaxt="n", ylab="SNVs")
abline(v=s, col='white')
image(z=J[o,], x=1:nrow(I),useRaster=TRUE, col=col, xaxt="n", ylab="Indels")
abline(v=s, col='white')
par(bty="n", xpd=NA)
plot(NA,NA, xaxt="n", yaxt="n", xlab="", ylab="", xlim=c(0, nrow(J)), ylim=c(0,1), xaxs="i", yaxs='i')
d0 <- s - diff(c(0,s))/2
d1 <- mg14:::mindist(d0,30)
segments(d0, 1.12, d0, 1.2)
segments(d0, 1.12, d1, 1.08)
segments(d1, 1.08, d1, 1)
mg14::rotatedLabel(x0 = d1, names(s), y0=1)
legend("bottom", fill=col, legend=paste(dimnames(finalGenotypes)[[3]]), bty="n", horiz=TRUE, title="Mutation timing")
The proportions of subs and indels are remarkably similar.
For the manuscript only plot subs, together with the color code for each tumour type.
f <- function(x) unlist(sapply(seq_along(x), function(i) rep(i, x[i])))
d <- t(sapply(names(finalSnv)[whiteList & selectedSamples], function(n) table(info(finalSnv[[n]])$CLS, useNA='a') + table(info(finalIndel[[n]])$CLS, useNA='a')))
o <- order(droplevels(donor2type[sample2donor[rownames(d)]]), -d[,1]/rowSums(d))
I <- t(apply(d/rowSums(d), 1, function(x) f(mg14:::roundProp(x * 100,p=100))))
s <- cumsum(table(droplevels(donor2type[sample2donor[rownames(d)]][o])))
col <- RColorBrewer::brewer.pal(9, "Set1")[c(3,4,2,1,9)] ## Colors for early-subclonal
par(fig=c(0,1,0,1),mar=rep(0,4), bty="n", tcl=-0.25)
layout(matrix(1:5,nrow=5), height=c(1,4,1.2,0.8,4))
par(cex=1)
plot(NA,NA, xaxt="n", yaxt="n", xlab="", ylab="", xlim=c(0, nrow(I)), ylim=c(0,1), xaxs="i", yaxs='i')
legend("bottom", fill=col, legend=paste(dimnames(finalGenotypes)[[3]]), bty="n", horiz=TRUE, title="Mutation timing", cex=1)
par(mar=c(0,4,0,0)+.1, mgp=c(2,.5,0), bty="n", las=2, xpd=FALSE, cex=1)
image(z=I[o,], x=1:nrow(I), useRaster=TRUE, col=col, xaxt="n", ylab="Point mutations")
abline(v=s, col='white')
u <- par("usr")
par(bty="o")
barplot(t(t(table(droplevels(donor2type[sample2donor[rownames(d)]][o])))), horiz=TRUE, col=tissueColors[names(s)], border=NA, xlim=u[1:2], yaxs='i', xaxt="n")
par(bty="n", xpd=NA)
plot(NA,NA, xaxt="n", yaxt="n", xlab="", ylab="", xlim=c(0, nrow(I)), ylim=c(0,1), xaxs="i", yaxs='i')
d0 <- s - diff(c(0,s))/2
d1 <- mg14:::mindist(d0,60)
segments(d0, 0.66, d0, 1)
segments(d0, 0.66, d1, 0.33)
segments(d1, 0.33, d1, 0)
mg14::rotatedLabel(x0 = d1, names(s), y0=0)
Also dump the xlsx output.
Figure2a <- createSheet(Figure2, "Figure2a")
addDataFrame(data.frame(d,cancer=droplevels(donor2type[sample2donor[rownames(d)]]))[o,c(6,1:5)], Figure2a)
#t <- 12/8
#dev.copy2pdf(file="finalMutationPropAll.pdf", width=9*t, height=1.8*t, pointsize=8*t)
Not turn to driver point mutations. Extract from the large arrays calculated above.
p <- asum(finalGenotypesP[,,,selectedSamples[whiteList]], c(2,4))
g <- asum(finalGenotypes[,,,,selectedSamples[whiteList]], c(2,4:5))
g <- g[order(rowSums(g), decreasing=TRUE),]
colnames(g) <- paste(colnames(g))
rownames(g) <- paste(rownames(g))
rownames(p) <- paste(rownames(p))
p <- p[rownames(g),]
w <- rowSums(g) > 0
w[1] <- FALSE
Collapse into unique genes
l <- sapply(strsplit(rownames(p),"::"), function(x) paste(x[2:3], collapse=":"))
pu <- t(sapply(unique(l), function(u) asum(p[l==u,,drop=FALSE], 1)))
gu <- t(sapply(unique(l), function(u) asum(g[l==u,,drop=FALSE], 1)))
gu <- gu[order(rowSums(gu), decreasing=TRUE),]
pu <- pu[rownames(gu),]
wu <- rowSums(gu) > 0
wu[1] <- FALSE
Now plot as a barplot. All drivers, with the top 50 as an inset.
par(fig=c(0,1,0,1),mar=c(1,4,1,1)+.1, mgp=c(3,.5,0))
barplot(t(gu[wu,]), col=col, las=2, legend=TRUE, args.legend=list("topright", bty='n'), ylab="Number of cases", names.arg=rep("",sum(wu)), border=NA)
#mg14::rotatedLabel(x=.Last.value,labels=rownames(g)[62:201], cex=.25)
u <- par("usr")
v <- c(
grconvertX(u[1:2], "user", "ndc"),
grconvertY(u[3:4], "user", "ndc")
)
v <- c( (v[1]+v[2])/3.33, v[2], (v[3]+v[4])/3, v[4] )
par( fig=v, new=TRUE, mar=c(0,0,0,0) )
b <- barplot(t(gu[2:51,]), col=col, las=2, names.arg=rep("",50))
mg14::rotatedLabel(x=b,labels=rownames(gu)[2:51], cex=.5)
#dev.copy2pdf(file="finalDrivers.pdf", width=9, height=5, pointsize=8)
Also generate xlsx output
Figure2a2 <- createSheet(Figure2, "Figure2a2")
addDataFrame(gu[wu,], Figure2a2)
gt <- asum(finalGenotypes[,,,,selectedSamples[whiteList]], c(2:4))
t <- droplevels(donor2type[sample2donor[colnames(gt)]])
rownames(gt) <- paste(rownames(gt))
gt <- gt[rownames(g),] %*% model.matrix(~t-1)
colnames(gt) <- levels(t)
gtu <- t(sapply(unique(l), function(u) asum(gt[l==u,,drop=FALSE], 1)))
gtu <- gtu[rownames(gu),]
Figure2a3 <- createSheet(Figure2, "Figure2a3")
addDataFrame(gtu[wu,], Figure2a3)
Proportions may be more revealing than absolute numbers:
par(fig=c(0,1,0,1),mar=c(3,4,1,1)+.1, mgp=c(3,.5,0))
w <- rowSums(pu) > 0
n <- 50
b <- barplot(t(pu /rowSums(pu))[,w], width=c(rep(2,n+1), rep(0.2,sum(w)-n-1)), space=c(0,2,rep(0.1,n), rep(0,sum(w)-n-2)), col=col, border=NA, ylab="Fraction of mutations", names.arg=rep("",sum(w)))
mg14::rotatedLabel(x=b[1:(n+1)],labels=c("Genome-wide", rownames(pu)[2:(n+1)]), cex=.5)
#s <- 12/8
#dev.copy2pdf(file="finalDriversProp.pdf", width=9*s, height=3*s, pointsize=8*s)
Sort driver mutations by prevalence in each timing class. Clearly the MAP estimate is biased downwards for driver genes not observed in a final cohort size. Hence to this without and with a pseudocount of one (which will overestimate). Report the average of the two as the point estimate.
tt <- abind::abind(pu[-1,], pu[-1,] + 0.5, along=3)
par(mar=c(3,3,1,1), mgp=c(2,.5,0), bty="L")
r <- array(apply(tt/rep(colSums(tt), each=nrow(tt)), 3, function(x) apply(x, 2, function(y) cumsum(sort(y, decreasing=TRUE)))), dim=dim(tt))
plot(cumsum(sort(r[,1,1], decreasing=TRUE)), col=NA, type='s', xlab="Number of different driver genes", ylab="Fraction of driver mutations", log='', ylim=c(0,1), xlim=c(0,664), xaxs='i', yaxs='i')
for(j in 1:4) {
polygon(c(1:nrow(r), nrow(r):1), c(r[,j,1], rev(r[,j,2])), col=paste0(col[j],"33"), border=NA)
lines((r[,j,1]+r[,j,2])/2, col=col[j], lwd=2)
points(y=0,x=which.min((r[,j,1]+r[,j,2])/2 < 0.5), col=col[j], pch="|")
}
legend("bottomright", col=col[1:4], lty=1, paste0(c("clonal [early]", "clonal [late]", "clonal [other]", "subclonal"), ", n=", round(colSums(p[-1,]))[-5]), bty="n")
abline(h=0.5, lty=3)
#dev.copy2pdf(file="finalGenesCumulative.pdf", width=4,height=4)
Now plot the value where the number of unique genes contributes 50% of all driver mutations.
par(mar=c(4,3,2.5,1), mgp=c(2,.5,0), bty="L")
d50 <- apply((r[,,1]+r[,,2])/2 < 0.5, 2, which.min)[c(1,3,2,4)]
b <- barplot(d50,las=2, col=col[c(1,3,2,4)], border=NA, ylab="Genes contributing 50% of driver mutations")
lo <- apply(r[,,1] < 0.5, 2, which.min)[c(1,3,2,4)]
hi <- apply(r[,,2] < 0.5, 2, which.min)[c(1,3,2,4)]
segments(b,lo,b,hi)
mg14::rotatedLabel(x=b,labels=c("clonal [early]", "clonal [late]", "clonal [other]", "subclonal")[c(1,3,2,4)])
#dev.copy2pdf(file="finalGenes50.pdf", width=3,height=4)
xlxs output
Figure2d <- createSheet(Figure2, "Figure2d")
addDataFrame(data.frame(d50, lo, hi, row.names=c("clonal [early]", "clonal [late]", "clonal [other]", "subclonal")[c(1,3,2,4)]), Figure2d)
Now turn to the timing of copy number alterations. First assess the whole genome duplications (WGD).
Final ploidy, weighted if subclonal CN
finalPloidy <- sapply(finalBB, averagePloidy)
names(finalPloidy) <- names(finalBB)
Final homozygousity, weighted if subclonal CN
finalHom <- sapply(finalBB, averageHom)
names(finalHom) <- names(finalBB)
WGD is primarily classified by the copy number data. Additionally we assess the concordance of timing.
Using the quantities above, we classify WGD samples. This is the official WG output.
isWgd <- .classWgd(finalPloidy, finalHom)
table(isWgd)
## isWgd
## FALSE TRUE
## 1960 818
plot(finalHom, finalPloidy, col=.classWgd( finalPloidy, finalHom)+1, xlim=c(0,1))
Further it’s worthwile assessing whether the different copy number segments show the same timing, which would be expected for WGD. We do this for both WGD and near diploid (ND) samples.
fracGenomeWgdComp <- t(sapply(finalBB, function(bb) {
fgw <- try(fractionGenomeWgdCompatible(bb));
if(class(fgw)!='try-error') fgw
else rep(NA,10)}))
rownames(fracGenomeWgdComp) <- names(finalBB)
The temporal concordance defines out rating. Some samples are uninformative (especially ND) either due to too little point mutations (large timing CIs), or too few gained segments.
wgdStar <- factor(rep(1,nrow(fracGenomeWgdComp)), levels=0:3, labels=c("unlikely","uninformative","likely","very likely"))
wgdStar[fracGenomeWgdComp[,"avg.ci"]<=0.75 & fracGenomeWgdComp[,"nt.total"]/chrOffset["MT"] >= 0.33 ] <- "likely"
wgdStar[fracGenomeWgdComp[,"nt.wgd"]/fracGenomeWgdComp[,"nt.total"] < 0.66] <- "unlikely"
wgdStar[wgdStar=="likely" & fracGenomeWgdComp[,"nt.wgd"]/fracGenomeWgdComp[,"nt.total"] > 0.8 & fracGenomeWgdComp[,"sd.wgd"] < 0.1 & fracGenomeWgdComp[,"nt.total"]/chrOffset["MT"] > 0.5] <- "very likely"
names(wgdStar) <- names(finalBB)
prop.table(table(wgdStar[!isWgd]))
##
## unlikely uninformative likely very likely
## 0.203061224 0.775510204 0.019387755 0.002040816
wgdPoss <- !isWgd & 2.5 - 1.5 * finalHom <= finalPloidy
wgdStat <- factor(wgdPoss + 2*isWgd - wgdPoss*isWgd, labels=c("absent","possible","present"))
table(wgdStat, wgdStar)
## wgdStar
## wgdStat unlikely uninformative likely very likely
## absent 384 1492 9 0
## possible 14 28 29 4
## present 68 1 573 176
Overall, WGD samples display a high level of temporal concordance, as hoped.
In this section we assess the temporal distribution of large-scale copy number gains.
This one aggregates individual segments by chromosome
aggregatePerChromosome <- function(bb, isWgd=FALSE){
.aggregateSegments <- function(m){
#m <- mcols(bb)
t <- weighted.mean(m$time, m$n.snv_mnv, na.rm=TRUE)
n <- sum(m$n.snv_mnv[!is.na(m$time)], na.rm=TRUE)
sd <- sd(m$time, na.rm=TRUE)
ci <- weighted.mean(m$time.up-m$time.lo, m$n.snv_mnv, na.rm=TRUE)
w <- sum(m$width[!is.na(m$time)], na.rm=TRUE)
c(time=t, n=n, sd=sd, ci=ci,w=w)
}
# if(!isWgd){
s <- split(as.data.frame(bb)[,c("time","time.up","time.lo","n.snv_mnv","width")], seqnames(bb))
r <- t(sapply(s, .aggregateSegments))
r <- r[c(1:22,"X"),]
# }else{
w <- .aggregateSegments(as.data.frame(bb))
r <- rbind(r,WGD=w)
# }
return(r)
}
This provides an average timing estimate per chromosome, per sample.
allChrAgg <- simplify2array(mclapply(finalBB, aggregatePerChromosome, mc.cores=2))
t <- allChrAgg[1:23,"time",!isWgd]
t[allChrAgg[1:23,"w",!isWgd] < diff(chrOffset)[1:23]*.33] <- NA
s <- split(as.data.frame(t(t)), droplevels(donor2type[sample2donor[names(finalSnv)]])[!isWgd])
n <- 10
at <- function(x, n){
if(sum(!is.na(x))<3) return(rep(sum(!is.na(x))/n,n))
bw=if(sum(!is.na(x))< 6) 0.5 else "nrd0"
d <- density(x, n=n, from=1/n/2, to=1-1/n/2, bw=bw, na.rm=TRUE)
d$y/sum(d$y)*d$n
}
allChrCancerHist <- sapply(s, apply, 2, at, n=n, simplify="array")
u <- split(data.frame(WGD=allChrAgg["WGD","time",isWgd]), droplevels(donor2type[sample2donor[names(finalSnv)]])[isWgd])
wgdCancerHist <- sapply(u, function(x) if(nrow(x)>0){at(x$WGD,n=n)}else{rep(0,n)}, simplify="array")
allChrCancerHist <- abind::abind(allChrCancerHist, All=sapply(sapply(s, as.matrix), at, n=n, simplify="array")/23*5, WGD=wgdCancerHist, along=2)
Plot the timing histograms (deciles) per chromosome per cancer type, similar to Figure 1c in the final publication.
prgn <- RColorBrewer::brewer.pal(11,"PRGn")
set1 <- RColorBrewer::brewer.pal(9,"Set1")
col <- colorRampPalette(set1[c(4,9,3)])(n)
p <- 0
v <- table(droplevels(donor2type[sample2donor[names(finalSnv)]]))
h <- (allChrCancerHist + p) / rep(v + p, each=prod(dim(allChrCancerHist)[1:2]))
h <- aperm(h, c(2,3,1))
a <- colMeans(h[c("All","WGD"),,] * c(23/5,1)) %*% 1:n / asum(h* c(23/5,1), c(1,3))
o <- order(-a)
h <- h[,o,]
w <- v[o]>=15 & apply(h, 2, max) > 0.05*8/n
h <- h[,w,]
m <- 0.02
layout(matrix(1:prod(dim(h)[1:2]+1), ncol=dim(h)[1]+1, byrow=TRUE), height=c(rev(apply(h, 2, max))+m, 0.15), width=c(5, rep(1,dim(h)[1])))
par(mar=c(0.05,0.1,0,0.1), xpd=NA)
for(j in dim(h)[2]:0+1) for(i in 0:dim(h)[1]+1) {
#if(all(h[i,j,]==0))
if(i==1 & j !=1) {plot(NA,NA,xlab="",ylab="", xaxt="n",yaxt="n",xlim=c(0,1),ylim=c(0,1), bty="n")
text(1,0,dimnames(h)[[2]][j-1],pos=2)
next
}
if(j ==1 ){
plot(NA,NA,xlab="",ylab="", xaxt="n",yaxt="n",xlim=c(0,1),ylim=c(0,1), bty="n")
if(i==1) next
text(0.5,1,dimnames(h)[[1]][i-1],pos=1)
next
}
r <- c(0,max(h[,j-1,]+m))
par(bty=if(i==2)"L" else "n")
barplot(h[i-1,j-1,], ylim=r, width=1/n,space=0, col=rev(col), xaxt="n", yaxt="n", xlab="",ylab="", border=NA,xpd=TRUE, yaxs="i", xaxs="i", xlim=c(-0.5/n,1+0.5/n))
axis(side=1, at=c(-0.5/n,1+0.5/n), labels=c("",""), tcl=-.1)
if(i>1)
abline(v=0, col='lightgrey', lty=3)
if(i==2){
abline(h=0.05*8/n, col='lightgrey', lty=1)
axis(side=2, at=c(0,0.05*8/n), labels=c("",""), tcl=-.1)
}
}
#dev.copy2pdf(file="histTiming.pdf",width=6, height=6, pointsize=8)
vv <- v[dimnames(h)[[2]]]
vv <- vv/sum(vv)
hh <- matrix(matrix(aperm(h, c(1,3,2)), ncol=length(vv)) %*% vv, nrow=nrow(h))
rownames(hh) <- rownames(h)
It can also be instructive to study the timing histograms pan-cancer:
par(mar=c(3,3,1,1), mgp=c(2,.5,0), tcl=-0.5, bty="L", xpd=NA)
barplot(hh["WGD",], space=0, col=rev(col), xlab="Time [mutations]", ylab="Relative frequency", width=0.1, ylim=c(0,.065), yaxs='r', border=NA)
axis(side=1)
barplot(hh["All",], space=0, col=rev(col), xlab="Time [mutations]", ylab="Relative frequency", width=0.1, ylim=c(0,.065), yaxs='r', border=NA)
axis(side=1)
#dev.copy2pdf(file="histTimingPanCan.pdf",width=2, height=2, pointsize=8)
As a sanity check for the molecular timing estimates, calculate number of substitutions and timing per tumour type. There shouldn’t be a trend.
n <- nSub <- sapply(finalSnv, nrow)
n[timingInfo$timeCoamp==0] <- NA
q <- unlist(sapply(split(n, donor2type[sample2donor[names(finalSnv)]]), function(x) as.numeric(cut(x, {if(sum(!is.na(x))>1) quantile(x, seq(0,1,0.1), na.rm=TRUE) else 1:10}, include.lowest=TRUE))))
m <- match(names(finalSnv),unlist(split(names(finalSnv), donor2type[sample2donor[names(finalSnv)]])))
t <- timingInfo$timeCoamp
table(decSub=q[m], time=cut(t, seq(0,1,0.1)))
## time
## decSub (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8] (0.8,0.9] (0.9,1]
## 1 19 14 12 23 27 18 25 28 40 13
## 2 13 11 12 18 18 18 21 32 29 28
## 3 15 11 16 13 15 18 31 34 24 21
## 4 16 13 13 10 22 16 12 28 37 29
## 5 8 13 14 14 15 18 18 36 35 34
## 6 15 8 20 11 12 18 19 21 38 34
## 7 8 13 14 12 16 15 26 26 37 26
## 8 13 5 16 15 16 19 23 29 37 28
## 9 20 18 10 15 20 14 16 31 25 28
## 10 27 17 14 17 13 23 17 25 24 40
Also calculate deciles of timing per tumour type; this is an even stronger indication of independence, as hoped.
t[t==0] <- NA
r <- unlist(sapply(split(t, donor2type[sample2donor[names(finalSnv)]]), function(x) as.numeric(cut(x, {if(sum(!is.na(x))>1 & length(unique(x)) > 2) quantile(jitter(x), seq(0,1,0.1), na.rm=TRUE) else 1:10}, include.lowest=TRUE))))
table(decSub=q[m], decTime=r[m])
## decTime
## decSub 1 2 3 4 5 6 7 8 9 10
## 1 28 25 28 23 20 19 14 24 16 18
## 2 17 21 18 18 21 22 21 19 20 22
## 3 22 17 15 26 14 22 28 17 19 14
## 4 18 14 22 17 23 22 18 19 20 20
## 5 12 27 15 22 19 12 16 28 30 20
## 6 21 15 18 13 21 19 26 18 25 18
## 7 9 20 22 26 18 25 18 12 26 16
## 8 14 23 20 18 19 21 24 17 23 20
## 9 27 21 17 21 20 15 17 24 16 19
## 10 40 17 13 21 24 17 21 13 20 23
Plot the number of mutations per sample vs the average duplication time.
#pdf("timeNsub.pdf", 3, 2.5, pointsize=8)
par(mar=c(3,4,1,1), bty="n", mgp=c(2,.5,0), las=1, tcl=-.25)
d <- as.character(donor2type[sample2donor[names(finalSnv)]])
lineCol <- tissueColors
lineCol[grep("Lung", names(lineCol))] <- "black"
plot(t, nSub, log='y', bg=tissueColors[d], pch=21, xlab="Typical amplification time", ylab="", cex=.66, lwd=.5, yaxt="n", ylim=c(100,3e6))
mtext(side=2, "Number of SNVs", line=3, las=3)
u <- round(par("usr")[3:4])
a <- axisTicks(par("usr")[3:4], log=TRUE)
axis(side=2, at=a, labels=prettyNum(a))
b <- sapply(a[-length(a)], function(x) (1:10)*x)
axis(side=2, at=b, labels=rep("", length(b)), tcl=-.1)
#dev.off()
GBMs display very early timing on chromosomes 7, 20 and 21. Plot a few.
w <- which(fracGenomeWgdComp[,"time.wgd"]<0.1 & fracGenomeWgdComp[,"nt.total"]/chrOffset["MT"] > 0.1 & !isWgd & donor2type[sample2donor[names(finalBB)]]=="CNS-GBM")
plotSample(w[1])
plotSample(w[2])
plotSample(w[3])
Explore the early timing of chromosome 7 a bit more deeply. Find all GBMs with +7
w <- which(donor2type[sample2donor[names(finalSnv)]]=="CNS-GBM")
w <- w[sapply(finalBB[w], function(bb) sum(width(bb)[as.logical(seqnames(bb)==7) & bb$total_cn >= 3], na.rm=TRUE)/width(refLengths[7])>0.8)]
Plot the first 10
for(ww in w[1:10]){
finalSnv[[ww]] -> v
v <- v[seqnames(v)==7]
v <- v[which(info(v)$MajCN <= 4 & info(v)$MajCN > 1)]
n <- sum(info(v)$MutCN==info(v)$MajCN, na.rm=TRUE)
plotSample(ww, title=paste0(sample2icgc[names(finalSnv)[ww]], ", n=", n,"/",nrow(v), " SNVs pre +7"))
}
Tabulate the number of point mutations preceding +7
t <- t(sapply(w, function(ww){
finalSnv[[ww]] -> v
v <- v[seqnames(v)==7]
v <- v[which(info(v)$MajCN <= 4 & info(v)$MajCN > 1)]
n <- sum(info(v)$MutCN==info(v)$MajCN, na.rm=TRUE)
c(pre_7=n, total=nrow(v))
}))
rownames(t) <- names(finalSnv)[w]
Less than 3 early
table(t[,1]<=3)
##
## FALSE TRUE
## 7 27
Plot the distributions of early and toal point mutations on chr7 across all GBM samples with +7:
.par()
par(bty="n")
beeswarm::beeswarm(as.numeric(pmax(t,0.5)) ~ factor(rep(c("pre +7","total"), each=nrow(t))), method='hex', pch=19, xlab="", ylab="Number of SNVs", cex=0.5, log=TRUE, yaxt='n', col=set1[c(3,9)])
axis(side=2, at=c(1,10,100,1000))
axis(side=2, at=0.5, label=0)
for( i in 0:2) axis(side=2, at=c(2,3,4,5,6,7,8,9)*10^i, labels=rep("",8), tcl=-0.1)
#dev.off()
Generate xlsx output:
ExtendedDataFigure3b <- createSheet(ExtendedDataFigure3, "ExtendedDataFigure3b")
addDataFrame(t, ExtendedDataFigure3b)
Medulloblastoma showed very frequent and early i17q. Gather them all:
w <- which(donor2type[sample2donor[names(finalSnv)]]=="CNS-Medullo")
w <- w[sapply(finalBB[w], function(bb) sum(width(bb)[as.logical(seqnames(bb)==17) & bb$total_cn >= 3], na.rm=TRUE)/width(refLengths[17])>0.5)]
Plot first 10 samples
#pdf("Medullo_i17q.pdf",3.2,3.2, pointsize=8)
for(ww in w[1:10]){
finalSnv[[ww]] -> v
v <- v[seqnames(v)==17]
v <- v[which(info(v)$MajCN <= 4 & info(v)$MajCN > 1)]
n <- sum(info(v)$MutCN==info(v)$MajCN, na.rm=TRUE)
plotSample(ww, title=paste0(sample2icgc[names(finalSnv)[ww]], ", n=", n,"/",nrow(v), " SNVs pre i17q"))
}
Tabulate early and late substitutions.
t <- t(sapply(w, function(ww){
finalSnv[[ww]] -> v
v <- v[seqnames(v)==17]
v <- v[which(info(v)$MajCN <= 4 & info(v)$MajCN > 1)]
n <- sum(info(v)$MutCN==info(v)$MajCN, na.rm=TRUE)
c(pre_i17q=n, total=nrow(v))
}))
rownames(t) <- names(finalSnv)[w]
Less than 1 early
table(t[,1]<=1)
##
## FALSE TRUE
## 21 74
Plot distribution across samples
.par()
par(bty="n")
beeswarm::beeswarm(as.numeric(pmax(t,0.5) + runif(length(t))*0.25 -0.1) ~ factor(rep(c("pre i17q","total"), each=nrow(t))), method='hex', pch=19, xlab="", ylab="Number of SNVs", cex=0.5, log=TRUE, yaxt='n', col=set1[c(3,9)], corral="gutter", corralWidth=1.25)
axis(side=2, at=c(1,10,100,1000))
axis(side=2, at=0.5, label=0)
for( i in 0:2) axis(side=2, at=c(2,3,4,5,6,7,8,9)*10^i, labels=rep("",8), tcl=-0.1)
Dump xlsx output
ExtendedDataFigure3d <- createSheet(ExtendedDataFigure3, "ExtendedDataFigure3d")
addDataFrame(t, ExtendedDataFigure3d)
saveWorkbook(ExtendedDataFigure3,'ExtendedDataFigure3.xlsx')
One thing observed in the temporal classification of WGD samples was that a surprisingly high fraction of near-diploid samples also display very concordant timings. Here we provide a basic assessment of the phenomenon. The final results were further refined, more accurately considering the exact copy numeber configurations involved.
Note: Final figures have slightly deviated from this earlier version due to more elaborate handling of segments.
d <- fracGenomeWgdComp
i <- d[,"avg.ci"]<=0.5 & d[,"chr.all"] > 2 #& fracGenomeWgdComp[,"nt.total"]/chrOffset["MT"] >= 0.1
timingClass <- paste(ifelse(isWgd,"WGD","ND"), ifelse(!i, "uninformative",""))
timingClass[i] <- paste0(timingClass[i], ifelse(d[i,"nt.wgd"]/d[i,"nt.total"] > 0.75,"sync","async"))
#timingClass[i] <- paste0(timingClass[i], cut(fracGenomeWgdComp[i,"nt.wgd"]/fracGenomeWgdComp[i,"nt.total"], c(0,0.5,0.8,1), include.lowest=TRUE))
timingClass <- factor(timingClass)
First the pie chart with the timing classification
#pdf("TimingClass.pdf", 4,4)
colTime <- c("#A0C758","#6B8934","#BEC6AD","#CEB299","#CC6415","#EF7B00")
names(colTime) <- levels(timingClass)[c(4,5,6,3,2,1)]
c <- c(RColorBrewer::brewer.pal(9, "Pastel1"),"#DDDDDD")
t <- table(timingClass)[names(colTime)]
pie(t, init.angle=90, labels=paste0(names(t), ",\nn=", t), col=colTime)
#t <- table(isWgd)
par(new=TRUE)
symbols(x=0,y=0,circles=0.4, inches=FALSE, add=TRUE, bg="white")
#pie(t, labels=c("",""), col=NA, lwd=5, lty=1, init.angle=90)
#dev.off()
xlsx output
Figure1f <- createSheet(Figure1, "Figure1f")
addDataFrame(t, Figure1f)
colnames(d) <- c("ntCoamp","ntAmp","timeCoamp","segCoamp","segAmp","chrCoamp","chrAmp", "sdTimeCoamp","avgCiSeg","sdAllSeg")
timingInfo <- data.frame(avgPloidy=finalPloidy, avgHom=finalHom, isWgd=isWgd, d, informative=i, timingClass=timingClass)
#tab <- rbind(tab, data.frame(WGD_call=otherStat, WGD_timing=NA, ploidy=otherPloidy, hom=otherHom, nt.wgd=NA, nt.total=NA, time.wgd=NA, sd.wgd=NA,avg.ci=NA, sd.all=NA))
write.table(file=paste0(Sys.Date(),"-Timing-info.txt"), timingInfo, quote=FALSE, row.names=TRUE, col.names=TRUE, sep="\t")
Show 9 prototypical examples for the different timing classes.
w <- which(wgdStar=="likely" & !isWgd) # Garden variety near-diploid with concordant timing
plotSample(w[1])
plotSample(w[2])
plotSample(w[3])
w <- which(wgdStar=="very likely" & isWgd) # Now a-star WGD with highly concordant timing.
plotSample(w[1])
plotSample(w[2])
plotSample(w[9])
w <- which(wgdStar=="unlikely" & !isWgd & fracGenomeWgdComp[,"nt.total"]/chrOffset["MT"] > 0.25 & fracGenomeWgdComp[,"avg.ci"] < 0.5) # Near diploid with highly discordant timing
plotSample(w[1])
plotSample(w[2])
plotSample(w[3])
We can also time secondary gains for certain configurations. This is always possible as long only one allele is gained, such as 3:1. If two alleles are gained this required additional assumptions. For 3:2 the typical assumption would be 1:1 -> 2:2 -> 3:2. For 4:2 the assumption is 1:1 -> 2:1 -> 4:2. Here we only consider gains of a single allele, which can be uniquely timed. These analyses were conducted by Lara Jerman. Here we load her data.
Load preprocessed data, aggregated by chromsome
load("two_gain_times.RData")
doubleGains <- as.data.frame(T.i.F)
m <- paste(doubleGains$sample, doubleGains$cnMaj, doubleGains$cnMin, doubleGains$chromosome, sep="_")
s <- split(doubleGains[,c("sample","tumor_type","T1_raw","T2_raw","n_mutations")], m)
doubleGainsAggregated <- Reduce("rbind",sapply(s, function(x) {
data.frame(sample=x$sample[1], tumor_type=x$tumor_type[1], T1_raw=weighted.mean(x$T1_raw, x$n_mutations),T2_raw=weighted.mean(x$T2_raw, x$n_mutations), n_mutations=sum(x$n_mutations))
}, simplify=FALSE))
First plot all samples and chromosomes, first versus second gain.
x <- doubleGainsAggregated$T1_raw/pmax(1, doubleGainsAggregated$T2_raw)
y <- doubleGainsAggregated$T2_raw/pmax(1, doubleGainsAggregated$T2_raw)
o <- order(doubleGainsAggregated$n_mutations, decreasing=TRUE)
plot(x[o],
y[o],
bg=tissueColors[as.character(donor2type[sample2donor[names(finalSnv)[doubleGainsAggregated$sample[o]]]])], pch=21,
col=tissueBorder[as.character(donor2type[sample2donor[names(finalSnv)[doubleGainsAggregated$sample[o]]]])],
xlab="Time [mutations], first gain",
ylab="Time [mutations], second gain",
cex=sqrt(doubleGainsAggregated$n_mutations[o]/500)+0.1,
lwd=0.5)
t <- table(doubleGainsAggregated$sample)
The distribution is surprisingly independent, as one might have expected multiple copies of the same allele to be gained at the same time.
Now split by timing class - no obvious trend.
names(timingClass) <- names(finalSnv)
par(mfrow=c(2,2))
for(l in grep("uninformative",levels(timingClass), invert=TRUE, value=TRUE)){
w <- which(timingClass[doubleGains$sample]==l)
o <- intersect(order(doubleGainsAggregated$n_mutations, decreasing=TRUE),w)
plot(x[o],
y[o],
bg=tissueColors[as.character(donor2type[sample2donor[names(finalSnv)[doubleGainsAggregated$sample[o]]]])], pch=21,
col=tissueBorder[as.character(donor2type[sample2donor[names(finalSnv)[doubleGainsAggregated$sample[o]]]])],
xlab="Time [mutations], first gain",
ylab="Time [mutations], second gain",
cex=sqrt(doubleGainsAggregated$n_mutations[o]/500)+0.1)
title(main=l, line=0, font.main=1)
}
Clusters are often caused by individual samples:
par(mfrow=c(5,5))
for(i in as.numeric(names(t)[t>5])[1:25]){
w <- which(doubleGainsAggregated$sample==i)
plot(x[w],y[w],
col=tissueBorder[as.character(donor2type[sample2donor[names(finalSnv)[doubleGainsAggregated$sample[w]]]])],
bg=tissueColors[as.character(donor2type[sample2donor[names(finalSnv)[doubleGainsAggregated$sample[w]]]])],
type='p', xlim=c(0,1), ylim=c(0,1),
xlab="time 1",
ylab="time 2",
pch=21,
cex=sqrt(doubleGainsAggregated$n_mutations[w]/500+0.1))
}
To formalise the observation that the second gains occur seemingly uniformly after the first, calculate and plot the relative latency.
w <- y < 1 & x > 0
r <- ((y-x)/(1-x))
h <- hist(r[w], breaks=seq(0,1,0.025), plot=FALSE)
e <- d <- density(r[w], from=-1, to=2, bw="SJ")
i <- which(d$x < 0)
d$y[max(i) + seq_along(i)] <- d$y[max(i) + seq_along(i)] + d$y[rev(i)]
i <- which(d$x > 1)
d$y[min(i) - seq_along(i)] <- d$y[min(i) - seq_along(i)] + d$y[i]
i <- which(d$x >0 & d$x < 1)
d <- list(x=d$x[i], y=d$y[i])
plot(h$mids,h$counts/sum(h$counts), pch=19, col='grey',ylim=c(0,max(h$counts/sum(h$counts))), xlab="Latency of second gain", ylab="Relative frequency", type='h')
The majority of secondary gains indeed appear to occur independently after the second. Only about 17% occur in close succession.
plot(d$x,cumsum(d$y * diff(d$x)[1]), xlim=c(0,1), type='l', ylim=c(0,1), xlab="Relative time of second gain", ylab="CDF")
It is perhaps instructive to assess the relative latency by timing class. WGD appears underrepresented in cases with synchronous double gains.
c <- cut(r[w], 20)
t <- table(timingClass[doubleGainsAggregated$sample[w]],c)
barplot(t[names(colTime),]/sum(t), border=NA, col=colTime, width=1/24, space=0.2, names.arg=rep("",20, bty="L", yaxs="s"))
.par()
axis(side=1, line=0.2)
xlxs output
Figure1h <- createSheet(Figure1, "Figure1h")
addDataFrame(t[names(colTime),], Figure1h)
Lastly, simulate higher order gains to cross-check whether the inference is credible.
n <- 100
c <- 40
purity=0.7
bb <- GRanges(seqnames=1, IRanges(1,1e8), major_cn=3, minor_cn=1, clonal_frequency=purity, n.snv_mnv=n)
bb$total_cn <- bb$major_cn+bb$minor_cn
t3 <- 0.8
t2 <- 0.8 # Simultaneous second amplification
d <- data.frame(cluster=1, n_ssms=1, proportion=purity)
cnStates <- defineMcnStates(bb,clusters=d, purity=purity)
pi <- matrix(c(4,2,1,0,1,0,0,0,1), byrow=TRUE, ncol=3) %*% c(1-t2,t2-t3,t3)
pi <- pi/sum(pi)
cnStates[[1]][,"P.m.sX"] <- pi
rho=0.01
cnStates[[1]][,"power.m.s"] <- 1-pbetabinom(2, size=c, prob=cnStates[[1]][,"f"], rho=rho )
bb$timing_param <- cnStates
s <- simplify2array(mclapply(1:50, function(foo){
set.seed(foo)
v <- simulateMutations(bb, n=40)
bb0 <- bb
bb0$timing_param <- NULL
L <- computeMutCn(v, bb0, clusters=d, purity=purity, rho=rho, n.boot=0)
L$P[[1]]
}))
boxplot(t(s[2:3,"T.m.sX",]), at=3:2, xlab="Simulated time point", names=c("t2","t3"))
points(3:2,c(t2-t3, t3), col='red', pch=19)
l <- s[2,"T.m.sX",]/(1-s[3,"T.m.sX",])
x <- seq(0,1,0.05)
plot(x[-1]+x[2]/2, as.numeric(prop.table(table(cut(l[l<1], x)))), xlab="Latency", ylab="frequency", type='h')
axis(side=1)
We also calculate how often higher order gains occur in each timing class. To this end, to a tally of all copy number gains.
cn <- do.call("rbind", sapply(names(finalBB), function(n){
bb <- finalBB[[n]]
data.frame(sample=n, chr=seqnames(bb), width=width(bb), M=bb$major_cn, m=bb$minor_cn)}, simplify=FALSE))
How does this relate to the length of the segments?
t <- table(pmin(cn$M,3) , pmax(3,round(log10(cn$width),1)), timingClass[cn$sample])
x <- as.numeric(colnames(t))
plot(NA,NA, type='p', col=colTime[1], pch=16, ylim=c(0,0.8), xlim=range(10^x), xlab="Length of segment", ylab="Proportion >2 allelic copies", log='x')
for(n in dimnames(t)[[3]]) {
y <- as.numeric(t[4,,n]/colSums(t[3:4,,n]))
lines(10^x,y, type='p', col=paste0(colTime[n],"44"), pch=16, cex=1)#sqrt(colSums(t[3:4,,i]/1000)))
lines(10^x, predict(loess(y ~x)), col=colTime[n], lwd=2)
}
Plot the fraction of samples with double gains for each timing class. Asynchronous samples appear overrepresented.
tt <- mg14:::asum(t[,x>=7,],2)
o <- names(colTime)
p <- tt[4,o]/colSums(tt[3:4,o])
ci <- sapply(c(0.025, 0.975), qbeta, 0.025, shape1=tt[4,o]+1, shape2=tt[3,o]+1)
barplot(p, col=colTime, border=NA, las=2, ylab="Proportion >2 allelic copies", names=sub("ormative","",sub("near-diploid", "ND", names(colTime))), ylim=c(0,0.4)) -> b
segments(b, ci[,1], b, ci[,2])
Produce xlsx output
Figure1g <- createSheet(Figure1, "Figure1g")
addDataFrame(t(tt[3:4,o]), Figure1g)
Here we calculate approximate real time estimates of WGD and MRCA using only CpG>TpG mutations, which are found in every tumoyr type and vary relatively little between samples.
The approximate chronological timing is basd only CpG>TpG mutations, which universally occur in all tissues, are used as a precautionary measure. We first study the burden of CpG>TpG mutations (RpCpG>RpTpG in Melanoma) in each sample across tissues and as a function of age. This enables us to remove some hypermutant samples and derive a range of possible mutation rate increase.
Branch lengths need to be adjusted for time-dependent copy number as well as the structure of the subclonal phyllogeny.
Get the age of every donor.
age <- clinicalData$donor_age_at_diagnosis
names(age) <- clinicalData$icgc_donor_id
Exclude these cancer types:
typeNa <- gsub("\t","",strsplit("Bone-Cart
Breast-LobularCa
Breast-DCIS
Lymph-NOS
Myeloid-MDS
Cervix-AdenoCa", "\n")[[1]])
Calculate effective genome size, i.e. time-averaged ploidy from mutation copy numbers. This is useful to convert mutation counts into approximate rates.
effGenome <- unlist(mclapply(finalSnv, function(vcf) {
w <- info(vcf)$CLS!="subclonal"
if(donor2type[sample2donor[meta(header(vcf))$META["ID",]]]=="Skin-Melanoma")
w <- w & isDeaminationNoUV(vcf)
else
w <- w & isDeamination(vcf)
2/avgWeights(vcf[na.omit(w)])
}))
names(effGenome) <- names(finalSnv)
Also calculate the power to detect variants for different subclones to extrapolate their true branch length.
finalPower <- sapply(names(finalBB), function(n) {
x <- finalBB[[n]]
f <- finalClusters[[n]]$proportion
for(i in 1:length(x)){
t <- x$timing_param[[i]]
p <- t[match(f, t[,"cfi"]), "power.s"]
if(!is.null(p)) if(all(!is.na(p))) break
}
if(is.null(p)) return(rep(NA, length(f)))
return(p)
})
#plot(unlist(lapply(wccClusters[names(finalSnv)], `[[`, "n_ssms")),unlist(lapply(finalClusters[names(finalSnv)], `[[`, "n_ssms"))/ unlist(finalPower), log='xy',
# xlab="Cluster size WCC (consensus)", ylab="Cluster size MutationTime.R")
The following calculates the length of the trunk (clonal mutations) and the depth of the MRCA, scaled by power and using a densitly branching subclonal phylogeny. Scaling branches by 1/f assumes the densest possible branching topology and this the shortest latency.
branchDeam <- t(simplify2array(mclapply(finalSnv, function(vcf) {
n <- meta(header(vcf))$META["ID",]
if(donor2type[sample2donor[n]]=="Skin-Melanoma")
w <- isDeaminationNoUV(vcf)
else
w <- isDeamination(vcf)
if(sum(w)==0) return(c(0,0))
p <- info(vcf)$pSub[w];
n.subclonal <- aggregate(p, list(info(vcf)$CNF[w]), sum, na.rm=TRUE)
m <- apply(abs(outer(n.subclonal$Group.1, finalClusters[[n]]$proportion, `-`)),1,which.min) # Match to subclones
p.subclonal <- finalPower[[n]][m] # Power of subclones
b.subclonal <- n.subclonal$x %*% (n.subclonal$Group.1 / p.subclonal) / max(n.subclonal$Group.1) # Subclonal branch, power adjusted & 1/f-scaled
b.clonal <- sum(1-p, na.rm=TRUE)/finalPower[[n]][1] # Clonal branch (trunk), power adjusted & 1/f-scaled
c(b.subclonal, b.clonal)})))
d <- droplevels(donor2type[sample2donor[names(finalSnv)]])
typesSubclones <- setdiff(levels(d), c(typeNa, names(which(table(d)<5))))
nClones <- sapply(finalClusters, nrow)
Comparison to linear branching in which the branch lengths of all subclonal populations is added. This constitutes the other extreme, corresponding to the maximal latency.
branchDeamLinear <- t(simplify2array(mclapply(finalSnv, function(vcf) {
if(donor2type[sample2donor[meta(header(vcf))$META["ID",]]]=="Skin-Melanoma")
w <- isDeaminationNoUV(vcf)
else
w <- isDeamination(vcf)
if(sum(w)==0) return(c(0,0))
n <- meta(header(vcf))$META["ID",]
if(donor2type[sample2donor[n]]=="Skin-Melanoma")
w <- isDeaminationNoUV(vcf)
else
w <- isDeamination(vcf)
if(sum(w)==0) return(c(0,0))
p <- info(vcf)$pSub[w];
n.subclonal <- aggregate(p, list(info(vcf)$CNF[w]), sum, na.rm=TRUE)
m <- apply(abs(outer(n.subclonal$Group.1, finalClusters[[n]]$proportion, `-`)),1,which.min) # Match to subclones
p.subclonal <- finalPower[[n]][m] # Power of subclones
b.subclonal <- n.subclonal$x %*% (1 / p.subclonal) # Subclonal branch, power adjusted
b.clonal <- sum(1-p, na.rm=TRUE)/finalPower[[n]][1] # Clonal branch (trunk), power adjusted
c(b.subclonal, b.clonal)})))
Compare branching and linear topologies:
f <- (branchDeam[,1] / finalPloidy) / rowSums(branchDeam / cbind(finalPloidy, effGenome))
l <- (branchDeamLinear[,1]/ finalPloidy)/rowSums(branchDeamLinear / cbind(finalPloidy, effGenome))
t <- donor2type[sample2donor[names(finalSnv)]]
plot(f, l, xlab="Subclonal branch length (branching)", ylab="Subclonal branch length (linear)", pch=21, bg=tissueColors[t], col=tissueBorder[t], cex=tissueCex[t])
abline(0,1, lty=2)
quantile(l/f, na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 1.000000 1.230433 1.787160 2.324245 18.979107
quantile(l, na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 0.00000000 0.01893107 0.09696972 0.27966436 0.98292792
quantile(f, na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 0.000000000 0.009730081 0.045141939 0.135827609 0.955712890
Here we convert branch lengths into average mutation rates (assuming no change, ie avgmu=n/age). First study relation of mutation burden to age, then calculate average mutation rates, exclude hypermutators and also samples with tumour in normal 1% for further analyses.
Plot a large overview:
rateDeam <- cc <- list()
remove <- "8454fe53-869d-41c8-b0c8-a7929d00eec3" # a cell line, add more samples in the following
par(mfrow=c(6,6), mar=c(3,3,2,1),mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="L", xpd=FALSE, las=1, xpd=FALSE)
for(n in typesSubclones){
i <- d==n
tt0 <- branchDeam[i,]/cbind(finalPloidy[i], effGenome[i]) / 3#cbind(nClones[i]-1, 1)/3 # 3Gb Haploid genome
tt0[is.infinite(tt0)|is.nan(tt0)] <- 0
yy <- rowSums(tt0)
a <- age[sample2donor[names(finalSnv)[i]]]
xx <- a
r <- yy/xx
m <- median(r[TiN[names(xx)] <= 0.01 & ! is.na(TiN[names(xx)])],na.rm=TRUE)
rateDeam[[n]] <- r
try({
w <- (r-m)^2/m^2 <= 2^2 & TiN[names(xx)] <= 0.01 & ! is.na(TiN[names(xx)])
remove <- c(remove, names(which(!w)))
plot(xx, yy, bg=tissueColors[n], col=tissueBorder[n], pch=NA, log='', xlab="Age at diagnosis", ylab="SNVs/Gb", main=n, ylim=c(0,pmin(1000,max(yy, na.rm=TRUE))), xlim=c(0,max(age, na.rm=TRUE)), cex.main=1)
#par(xpd=NA)
#segments(x0=0,y0=0, xx, yy, col=tissueLines[n], lty=tissueLty[n])
points(xx, yy, bg=tissueColors[n], col=ifelse(w,tissueBorder[n], tissueColors[n]), pch=ifelse(w,21,4))
abline(0, m, lty=3)
#lines(c(x0,2*x0), c(0,1))
#print(paste(n,cor(xx[w],yy[w],use='c'), cor(xx[w],tt0[w,] %*% c(0.2,1), use='c'), sep=": "))
l <- lm(yy~xx, data=data.frame(yy=yy[w],xx=xx[w], row.names=as.character(seq_along(yy[w]))))
f <- (summary(l))
p <- predict(l,newdata=data.frame(xx=seq(0,100,1)), se=TRUE)
polygon(c(seq(0,100,1),rev(seq(0,100,1))), c(p$fit - 2*p$se.fit, rev(p$fit+2*p$se.fit)), border=tissueBorder[n], col=paste0(tissueColors[n],"44"))
v <- which(!is.na(xx[w]*yy[w]))
cc[[n]] <- cbind(f$coefficients, f$cov.unscaled * f$sigma^2, coef(nnls::nnls(cbind(1,xx[w][v]), yy[w][v])))
})
}
## Warning in predict.lm(l, newdata = data.frame(xx = seq(0, 100, 1)), se = TRUE): prediction from a rank-deficient fit may
## be misleading
## Warning in cbind(...): number of rows of result is not a multiple of vector length (arg 3)
n <- names(rateDeam)
qRateDeam <- sapply(rateDeam, function(r){
m <- median(r[TiN[sample2donor[names(r)]] <= 0.01 & ! is.na(TiN[sample2donor[names(r)]])],na.rm=TRUE)
w <- (r-m)^2/m^2 <= 2^2 & TiN[sample2donor[names(r)]] <= 0.01 & ! is.na(TiN[sample2donor[names(r)]])
quantile(r[w], na.rm=TRUE)})
plot(sapply(rateDeam, median, na.rm=TRUE), pch=NA , ylab="SNVs/Gb/yr", main="CpG>TpG rate", ylim=c(0, max(qRateDeam)), cex.main=1, xaxt='n', xlab="Tumour type")
segments(seq_along(rateDeam),qRateDeam["0%",],seq_along(rateDeam), qRateDeam["100%",], col=tissueLines[n], lty=1)
points(sapply(rateDeam, median, na.rm=TRUE), pch=21, col=tissueBorder[n], bg=tissueColors[n])
length(remove)
## [1] 743
Plot the average rates as barplot across tissues. Relatively little variation.
par(mar=c(6,3,1,1))
o <- order(qRateDeam["50%",])
barplot(qRateDeam["50%",][o], col=tissueColors[colnames(qRateDeam)][o], border=tissueLines[colnames(qRateDeam)][o], las=2,names.arg=rep("",ncol(qRateDeam)) , ylab="CpG>TpG rate [SNVs/Gb/yr]", ylim=c(0, max(qRateDeam))) -> b
mg14::rotatedLabel(b, labels=colnames(qRateDeam)[o])
segments(b, qRateDeam["50%",][o], b, qRateDeam["100%",][o], col=tissueLines[colnames(qRateDeam)][o], lwd=2)
segments(b, qRateDeam["0%",][o], b, qRateDeam["50%",][o], col=tissueBorder[colnames(qRateDeam)][o], lwd=2)
Plot the copynumber and ITH-adjusted mutation burden versus age and add a loess fit.
tt0 <- branchDeam/cbind(finalPloidy, effGenome) / cbind(nClones-1, 1)/3 # 3Gb Haploid genome
tt0[is.infinite(tt0)|is.nan(tt0)] <- 0
m <- sapply(rateDeam, function(r){m <- median(r[TiN[sample2donor[names(r)]] <= 0.01 & ! is.na(TiN[sample2donor[names(r)]])],na.rm=TRUE)})
s <- rowSums(tt0)#/m[as.character(donor2type[sample2donor[names(finalSnv)]])]
s[remove] <- NA
t <- donor2type[sample2donor[names(finalSnv)]]
x <- age[sample2donor[names(finalSnv)]]
plot(x,s, bg=tissueColors[t], pch=21, ylim=c(0,1000), col=tissueBorder[t], cex=tissueCex[t]*2/3, lwd=0.25, xlab="Age", ylab="SNVs/Gb")
p <- predict(loess(s~x), newdata=sort(x, na.last=NA), se=TRUE)
r <- function(x) c(x, rev(x))
polygon(r(sort(x, na.last=NA)), c(p$fit+2*p$se, rev(p$fit-2*p$se)), col="#00000044", border=NA)
lines(sort(x, na.last=NA),p$fit)
#s <- 12/8; dev.copy2pdf(file="timeSubcloneAgePancan.pdf", width=2*s, height=2*s, pointsize=8*s)
xlsx
ExtendedDataFigure8a <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8a")
tab <- data.frame(age=x, `CpG>TpG/Gb`=s, tumour_type=t)
addDataFrame(tab, ExtendedDataFigure8a)
Is there a positive intercept in a linear mutation v age fit?
a <- simplify2array(cc[!names(cc) %in% c("Myeloid-AML","Bone-Epith")])
all(a[1,1,] + 2*a[1,2,]>0 )
## [1] TRUE
Positive slope?
all(na.omit(a[2,1,] + 2*a[2,2,]>0))
## [1] TRUE
plot(a[1,1,], a[2,1,], col=tissueColors[dimnames(a)[[3]]], pch=NA, xlab="Offset", ylab="SNVs/Gb/yr")
segments(a[1,1,], a[2,1,] - a[2,2,],a[1,1,], a[2,1,]+a[2,2,], col=tissueLines[dimnames(a)[[3]]], pch=19)
segments(a[1,1,]-a[1,2,], a[2,1,], a[1,1,]+a[1,2,], a[2,1,], col=tissueLines[dimnames(a)[[3]]], pch=19)
points(a[1,1,], a[2,1,], pch=21, bg=tissueColors[dimnames(a)[[3]]], col=tissueLines[dimnames(a)[[3]]])
abline(h=0, lty=3)
abline(v=0, lty=3)
The two quantities above can be used to calculate a fraction of mutations due to linear accumulation
par(mar=c(6,3,1,1))
ma <- sapply(split(age, donor2type[names(age)]), median, na.rm=TRUE)
fm <- pmax(a[2,7,],0)*ma[dimnames(a)[[3]]]/(pmax(0,a[2,7,])*ma[dimnames(a)[[3]]] + pmax(0,a[1,7,]))*100
o <- order(fm)
fmq <- sapply(names(fm), function(n){
aa <- mvtnorm::rmvnorm(10000, mean=a[,1,n], sigma=a[,5:6,n] )
aa <- aa[aa[,1]>=0 & aa[,2]>=0,]
quantile(pmax(aa[,2],0)*ma[n]/(pmax(0,aa[,2])*ma[n] + pmax(0,aa[,1])), c(0.025, 0.975))
}) *100
barplot(fm[o], col=tissueColors[dimnames(a)[[3]]][o], border=tissueLines[dimnames(a)[[3]]][o], las=2,names.arg=rep("",length(fm)) , ylab="Age-attributed mutations [%]") -> b
mg14::rotatedLabel(b, labels=names(fm[o]))
segments(b, fm[o], b, fmq[2,o], col=tissueLines[dimnames(a)[[3]]][o], lwd=2)
segments(b, fmq[1,o], b, fm[o], col=tissueBorder[dimnames(a)[[3]]][o], lwd=2)
abline(h=min(fmq[2,]))
abline(h=max(fmq[1,]))
As the OLS fits per cancer type suggest a robust linear trend with small positive offsets, we repeat the analysis using a hierarchical Bayesian model. This enables us to share some information across cancer types, implicitly encode the positivity constraints and also propagate uncertainty into functions of the inferred parameters, such as the fraction of mutations due to linear accumulation.
Prepare data for rstan
library(rstan)
y <- Reduce("c",rateDeam)
y <- y[!names(y) %in% remove]
x <- age[sample2donor[names(y)]]
y <- y*x
t <- donor2type[sample2donor[names(y)]]
d <- data.frame(x,y,t)
df <- d
df <- df[rowSums(is.na(df))==0,]
tt <- model.matrix(~droplevels(t)-1, data=df)
data <- list(n = nrow(df),
p = ncol(tt),
y = df$y,
x = tt * df$x,
t = tt
)
The actual stan model definition is defined in ./PCAWG-rates.stan:
data {
int<lower=0> n; // numbr of observations
int<lower=0> p; // number of types
vector<lower=0>[n] y; // mutations
matrix[n,p] x; // age
matrix[n,p] t; // tumour type
}
parameters {
real<lower=0> sigma; // const. variance
real<lower=0> tau; // time-dep variance
real<lower=0> alpha; // alpha for slope
real<lower=0> beta; // beta for slope
real<lower=0> gamma;
real<lower=0> delta;
vector<lower=0>[p] offset;
vector<lower=0>[p] slope;
}
transformed parameters {
vector[n] mu;
vector[n] nu;
vector[p] ones;
ones = rep_vector(1, p);
mu = x * slope + t * offset;
nu = sqrt( square(x * ones) * tau^2 + sigma^2);
}
model {
slope ~ gamma(alpha, beta);
offset ~ gamma(gamma, delta);
y ~ normal(mu, nu);
}
Now fit the model using Hamiltonian Monte Carlo
fit <- stan(
file = "PCAWG-rates.stan", # Stan program
data = data, # named list of data
chains = 1, # number of Markov chains
warmup = 1000, # number of warmup iterations per chain
iter = 2000, # total number of iterations per chain
cores = 1, # number of cores
refresh = 1000, # show progress every 'refresh' iterations
open_progress=FALSE,
seed=42
)
##
## SAMPLING FOR MODEL 'PCAWG-rates' NOW (CHAIN 1).
##
## Gradient evaluation took 0 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Adjust your expectations accordingly!
##
##
## Iteration: 1 / 2000 [ 0%] (Warmup)
## Iteration: 1000 / 2000 [ 50%] (Warmup)
## Iteration: 1001 / 2000 [ 50%] (Sampling)
## Iteration: 2000 / 2000 [100%] (Sampling)
##
## Elapsed Time: 448.87 seconds (Warm-up)
## 274.73 seconds (Sampling)
## 723.6 seconds (Total)
Collect parameters
s <- summary(fit, pars=c("offset","slope"))$summary
ab <- array(s, dim=c(ncol(tt),2,10), dimnames=list(levels(droplevels(t)), c("a","b"), colnames(s)))
Summary plot
plot(x,y, bg=tissueColors[t], pch=21, ylim=c(0,1000), col=tissueBorder[t], cex=tissueCex[t]*2/3, lwd=0.25, xlab="Age", ylab="SNVs/Gb")
for(i in 1:nrow(ab))
abline(ab[i,1,"50%"], ab[i,2,"50%"], col=tissueLines[levels(droplevels(t))[i]], lty=tissueLty[levels(droplevels(t))[i]])
Plot inferred rates and offsets with 95% credible intervals.
plot(ab[,1,"50%"], ab[,2,"50%"], col=tissueColors[dimnames(ab)[[1]]], pch=NA, xlab="Offset", ylab="SNVs/Gb/yr", xlim=range(ab[,1,c("2.5%","97.5%")]), ylim=range(ab[,2,c("2.5%","97.5%")]))
segments(ab[,1,"50%"], ab[,2,"2.5%"],ab[,1,"50%"], ab[,2,"97.5%"], col=tissueLines[dimnames(ab)[[1]]], pch=19)
segments(ab[,1,"2.5%"], ab[,2,"50%"],ab[,1,"97.5%"], ab[,2,"50%"], col=tissueLines[dimnames(ab)[[1]]], pch=19)
points(ab[,1,"50%"], ab[,2,"50%"], pch=21, bg=tissueColors[dimnames(ab)[[1]]], col=tissueLines[dimnames(ab)[[1]]])
abline(h=0, lty=3)
abline(v=0, lty=3)
a <- extract(fit, pars="offset")$offset
b <- extract(fit, pars="slope")$slope
colnames(a) <- colnames(b) <- levels(droplevels(t))
xlsx ouput
ExtendedDataFigure8c <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8c")
addDataFrame(ab, ExtendedDataFigure8c)
Create a large overview panel with all data points used and the Bayesian linear fits with 95% CIs.
par(mfrow=c(6,6), mar=c(3,3,2,1),mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="L", xpd=FALSE, las=1, xpd=FALSE)
d <- droplevels(t)
for(n in levels(d)){
i <- d==n
yy <- y[i]
xx <- x[i]
m <- median(yy/xx, na.rm=TRUE)
try({
plot(xx, yy, bg=tissueColors[n], col=tissueBorder[n], pch=21, log='', xlab="Age at diagnosis", ylab="SNVs/Gb", main=n, ylim=c(0,pmin(1000,max(yy, na.rm=TRUE))), xlim=c(0,max(x, na.rm=TRUE)), cex.main=1)
#points(xx, yy, bg=tissueColors[n], col=ifelse(w,tissueBorder[n], tissueColors[n]), pch=ifelse(w,21,4))
abline(0, m, lty=3)
x0 <- seq(0,100,1)
p <- apply(sapply(x0, function(x) a[,n] + b[,n]*x), 2, quantile, c(0.025,0.975), na.rm=TRUE)
polygon(c(x0,rev(x0)), c(p["2.5%",], rev(p["97.5%",])), border=tissueBorder[n], col=paste0(tissueColors[n],"44"))
})
}
xlsx outputs.
ExtendedDataFigure8b <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8b")
tab <- data.frame(age=x, `CpG>TpG/Gb`=y, tumour_type=d)
addDataFrame(tab, ExtendedDataFigure8c)
Calculate the fraction of mutations due to linear accumulation using the inferred parameters and propagate the error from the HMC samples.
q <- sapply(colnames(a), function(n){
w <- which(t==n & !is.na(y))
f <- sapply(w, function(j) x[j] * b[,n] / (a[,n] + x[j] * b[,n]))
quantile(rowMeans(f), c(0.025, 0.25, .5,.75,.975))
})*100
qPanCan=quantile(rowMeans(do.call("cbind",sapply(colnames(a), function(n){
w <- which(d==n & !is.na(y))
f <- sapply(w, function(j) x[j] * b[,n] / (a[,n] + x[j] * b[,n]))
}))),
c(0.025, 0.25, .5,.75,.975))*100
Plot the fraction of mutations due to linear accumulation using bar plots with 95% CIs, sorted.
par(mar=c(6,3,1,1))
o <- order(q["50%",])
barplot(q["50%",o], col=tissueColors[colnames(q)][o], border=tissueLines[colnames(q)][o], las=2,names.arg=rep("",length(q["50%",])) , ylab="Age-attributed mutations [%]", ylim=c(0,100)) -> b
mg14::rotatedLabel(b, labels=names(q["50%",o]))
segments(b, q["50%",][o], b, q["97.5%",o], col=tissueLines[colnames(q)][o], lwd=2)
segments(b, q["2.5%",o], b, q["50%",][o], col=tissueBorder[colnames(q)][o], lwd=2)
abline(h=min(q["97.5%",]), lty=3)
abline(h=max(q["2.5%",]), lty=3)
#abline(h=qPanCan["50%"], lty=4)
xlsx output
ExtendedDataFigure8e <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8e")
addDataFrame(t(q), ExtendedDataFigure8e)
Quickly assess whether this fraction qualitative agrees with simulations using a 5x range increase occuring between 0-15yrs prior to sampling.
set.seed(42)
x <- df$x
a <- runif(length(x), pmax(0.5, 1-15/x), 1) #
r <- rgamma(length(x), 10, 10)
y <- rpois(length(x), (a + (1-a)*5) * r * x * 6 * 0.2)/6
y[x < 40] <- NA
plot(x,y, ylim=c(0,max(y, na.rm=TRUE)), xlab="Age", ylab="SNVs/Gb", pch=21, bg="grey", lwd=0.5)
f <- lm(y~x)
summary(f)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.639 -5.169 -0.889 4.233 31.682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.01811 1.01079 4.965 7.58e-07 ***
## x 0.21817 0.01577 13.832 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.286 on 1696 degrees of freedom
## (280 observations deleted due to missingness)
## Multiple R-squared: 0.1014, Adjusted R-squared: 0.1008
## F-statistic: 191.3 on 1 and 1696 DF, p-value: < 2.2e-16
c <- coef(f)
abline(c)
mean(c[1] / (c[1]+ x*c[2]), na.rm=TRUE)
## [1] 0.3140266
Load a table with data from cancer relapse samples in 8 tissues. This data was collected from 8 separate studies and compiled by Santiago Gonzalez.
relapse <- read.table("../data/2018_05_acceleration.txt", header=TRUE, sep="\t")
head(relapse)
## ID Ttype Acc_CpG Acc_max Acc_min Primary.purity Relapse.purity Primary_CpG_muts Relapse_CpG_muts shared
## 1 DO46329 Ovarian 5.843684 7.304605 5.478454 NA NA 405 604 0.8765432
## 2 DO46344 Ovarian 7.977252 8.863613 7.977252 NA NA 341 409 0.8416422
## 3 DO46366 Ovarian 7.165330 8.429799 7.165330 NA NA 442 710 0.9162896
## 4 DO46368 Ovarian 6.705564 7.058488 5.646791 NA NA 332 622 0.8674699
## 5 DO46374 Ovarian 5.217451 5.217451 2.820244 NA NA 751 964 0.8788282
## 6 DO46380 Ovarian 3.469097 3.854552 3.276369 NA NA 444 540 0.6666667
## Primary_age Relapse_time Primary_ploidy Relapse_ploidy Primary_effGenome
## 1 52 4.372348 1.6 NA 1.527314
## 2 46 1.149897 1.8 NA 1.793000
## 3 65 5.500342 1.7 NA 1.788664
## 4 57 7.425051 1.9 NA 1.663099
## 5 69 3.750856 3.7 NA 3.212753
## 6 51 3.178645 1.8 NA 1.693212
Now caculate branch lengths to visualise these data, adjusted for (time-dependent) ploidy. To this end first the ploidy in the primary branch.
p <- relapse$Primary_ploidy
p[is.na(p)] <- 2
And for the MRCA. Impute effective genome if timing is missing using typical scale factor for effective genome (this is just for graphical purposes).
m <- p
m[m>2.8 & is.na(relapse$Primary_effGenome)] <- m[m>2.8 & is.na(relapse$Primary_effGenome)] * 0.7
m[m>2.8 & !is.na(relapse$Primary_effGenome)] <- relapse$Primary_effGenome[m>2.8 & !is.na(relapse$Primary_effGenome)]
Ploidy in the relapse branch
r <- relapse$Relapse_ploidy
r[is.na(r)] <- relapse$Primary_ploidy[is.na(r)]
r[is.na(r)] <- 2
Branch lengths, adjusted for ploidy
b_mrca <- relapse$Primary_CpG_muts * relapse$shared / 3 /m
b_relapse <- (relapse$Relapse_CpG_muts - relapse$Primary_CpG_muts * relapse$shared)/ 3/ r
b_primary <- (relapse$Primary_CpG_muts - relapse$Primary_CpG_muts * relapse$shared) / 3 /p
Corresponding time points
t_primary <- relapse$Primary_age
t_relapse <- t_primary + relapse$Relapse_time
t_mrca <- 0.5 * (t_primary * b_mrca/(b_mrca + b_primary) + pmin(t_primary, t_relapse * b_mrca/(b_mrca + b_primary) ))
#pdf("relapse_accel_branch.pdf", pointsize=8, width=4, height=2)
.par()
par(mfrow=c(1,2))
for(t in c("Ovarian", "Breast")){
w <- which(relapse$Ttype==t)
r <- relapse[w,]
plot(r$Primary_age+r$Relapse_time, r$Relapse_CpG_muts, ylim=c(0,150), xlim=c(0,80), pch=NA, xlab="Age", ylab="CpG>TpG/Gb")
tt <- c(Ovarian="Ovary-AdenoCa",Breast="Breast-AdenoCa")[t]
title(main=tt, line=0)
c <- tissueColors[tt]
#mrca <- r$Primary_CpG_muts*r$shared
#points(mrca_time, mrca)
segments(0,0, t_mrca[w],b_mrca[w], col=c)
segments(t_mrca[w], b_mrca[w], t_primary[w], (b_mrca+b_primary)[w], col=c)
segments(t_mrca[w], b_mrca[w], t_relapse[w], (b_mrca+b_relapse)[w], col=c)
points(t_primary[w], b_mrca[w]+b_primary[w], pch=21, bg=c, col="white")
points(t_relapse[w], (b_mrca+b_relapse)[w], pch=21, bg=c, col="white")
}
#dev.off()
ExtendedDataFigure8f <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8f")
tab <- data.frame(t_mrca, t_primary, t_relapse, b_mrca, b_primary, b_relapse, type=relapse$Ttype, ID=relapse$ID)
addDataFrame(tab[which(relapse$Ttype=="Ovarian"),], ExtendedDataFigure8f)
ExtendedDataFigure8g <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8g")
addDataFrame(tab[which(relapse$Ttype=="Breast"),], ExtendedDataFigure8g)
The acceleration - or mutation rate increase - is effectively the ratio of the ploidy adjusted CpG>TpG branch-length in the relase sample versus the MRCA. See Supplementary Methods for details.
Plot the acceleration values inferred from CpG>TpG mutations in each branch
#pdf('foo.pdf', 2,2, pointsize=8)
.par()
par(mar=c(5,3,1,1))
x <- relapse$Ttype
levels(x) <- c("Myeloid-AML","Breast-AdenoCa","CNS-LGG","Liver-HCC","Lung-AdenoCa","CNS-Medullo","Lymph-BNHL","Ovary-AdenoCa")
x <- as.character(x)
c <- tissueColors; c["CNS-LGG"] <- "black"
b <- tissueBorder; b["CNS-LGG"] <- "white"
t <- tissueCex; t["CNS-LGG"] <- 1
j <- jitter(as.numeric(factor(x)))
plot(NA,NA, xlim=c(.5,8), xaxt="n", xlab="", ylab="Acceleration", ylim=c(0,15))
segments(j, relapse$Acc_min, j, relapse$Acc_max, col=c[x])
points(Acc_CpG ~ j, data=relapse, bg=c[x], col=b[x], pch=21, cex=t[x])
abline(h=1, lty=3)
mg14::rotatedLabel(labels=levels(factor(x)))
#dev.off()
ExtendedDataFigure8h <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8h")
addDataFrame(relapse, ExtendedDataFigure8h)
The surplus of mutations in relapse samples (after adjusting for the depth of branching) shows a very good correlation with the time difference between primary and relapse sample
#pdf("relapse_time.pdf", pointsize=8, width=2, height=2)
.par()
plot(t_relapse - t_primary, b_relapse-b_primary, xlim=c(0,13), xlab="Time to relapse", ylab="Surplus in relapse, CpG>TpG/Gb", pch=16)
w <- t_relapse - t_primary < 10
abline(lm((b_relapse - b_primary)[w] ~ (t_relapse-t_primary)[w]))
legend("topright", c(paste("rho =",round(cor(t_relapse - t_primary, b_relapse-b_primary, method='s'),2)),
paste("p =",signif(cor.test(t_relapse - t_primary, b_relapse-b_primary, method='s')$p.value,1))), bty="n")
## Warning in cor.test.default(t_relapse - t_primary, b_relapse - b_primary, : Cannot compute exact p-value with ties
#dev.off()
Here we compare the infereed CpG>TpG baseline rates to data from four different studies on normal tissues.
Data of blood colonies from a 59yr-old individual. Taken from Lee-Six et al. Nature 2018
hsc_data <- read.table(gzfile("../data/Shearwater_calls_FDR0.95_all_muts.txt.gz"), header=TRUE, sep="\t")
ct <- which((hsc_data$REF=="C" & hsc_data$ALT=="T") | (hsc_data$REF=="G" & hsc_data$ALT=="A" ))
v <- VRanges(seqnames = hsc_data$X.CHROM, ref=hsc_data$REF, alt=hsc_data$ALT, ranges=IRanges(hsc_data$POS, width=1))
tnc=scanFa(file=refFile, resize(granges(v), 3,fix="center"))
cpgtpg <- grepl("(A.CG)|(T..CG)", paste(alt(v),tnc))
n_cpgtpg <- colSums(hsc_data[cpgtpg,5:144], na.rm=TRUE)
normal_hsc_cpgtpg <- quantile(n_cpgtpg/59/6, c( 0.5, 0.025,0.975))
Colonic crypts of several individuals. Data taken from Lee-Six et al. bioRxiv 2018, https://www.biorxiv.org/content/10.1101/416800v1.
colon_sbs <- read.table("../data/model_input_with_CtoTatCpG.txt", header=TRUE, sep="\t")
foo <- as.data.frame(summary(lm(CtoTatCpG/6 ~ age-1, data=colon_sbs))$coef)
normal_colon_cpgtpg <- quantile(colon_sbs$CtoTatCpG/colon_sbs$age/6, c( 0.5, 0.025,0.975))#c(foo$Estimate, foo$Estimate - 2*foo$`Std. Error`, foo$Estimate + 2*foo$`Std. Error`)
plot(colon_sbs$age, colon_sbs$CtoTatCpG/6, xlab="Age", ylab="CpG>TpG/Gb", pch=16)
Normal endometrial glands. Data from Moore et al., bioRxiv 2018, https://www.biorxiv.org/content/10.1101/505685v1.
tab <- read.table("../data/endom_subs.txt", sep="\t", header=TRUE)
quantile(tab$C.T.at.CpG/tab$Age, c(0.5, 0.025, 0.975))/6
## 50% 2.5% 97.5%
## 1.1830286 0.5780461 2.1285985
normal_endometrium_cpgtpg <- quantile(tab$C.T.at.CpG/tab$Age, c(0.5, 0.025, 0.975))/6
plot(tab$Age, tab$C.T.at.CpG/6, xlab="Age", ylab="CpG>TpG/Gb", pch=16)
Data for a single normal skin biopsy from Martincorena et al., Science 2015
foo <- read.table("../ref/PD20399be_wg_caveman_annotated_with_dinucs_for_mg14.txt", header=TRUE, sep="\t")
is_cpgtpg <- grepl("(A.CG[C,T])|(T.[A,G]CG)", paste(foo$mut,foo$trinuc_Ref))
normal_skin_cpgtpg <- sum(is_cpgtpg * foo$VAF)/0.375/55/6 # Adjust for YCG fraction, age and genome
normal_cpgtpg <- rbind(`Myeloid-MPN`=normal_hsc_cpgtpg,
`Skin-Melanoma`=c(normal_skin_cpgtpg,NA,NA) ,
`Uterus-AdenoCa`=normal_endometrium_cpgtpg,
`ColoRect-AdenoCa`=normal_colon_cpgtpg)
x <- abind::abind(cancer=ab[rownames(normal_cpgtpg), "b",colnames(normal_cpgtpg)], normal=normal_cpgtpg[,], along=3)
x["Skin-Melanoma",,] <-x["Skin-Melanoma",,]
Side-by-side comparison of the median rates of normal tissues with linear (ie presumed baseline) rates from cancer samples with 95% CI.
#pdf("cancer_normal_cpgtpg.pdf", 2.5,2.5, pointsize=8)
par(mar=c(3,3,1,1), bty="L", mgp=c(2,.5,0), tcl=-0.25, las=1)
t(barplot(t(x[,"50%",]), beside=TRUE, col=rep(tissueColors[rownames(normal_cpgtpg)], each=2), density=rep(c(NA,36), nrow(normal_cpgtpg)), ylim=c(0,max(x, na.rm=TRUE)),
ylab="Mutation rate [CpG>TpG/Gb/yr]", names.arg=c("Blood","Skin","Uterus","Colon"))) -> b
legend("topleft", c("Cancer","Normal"), fill="black", density=c(NA,36), bty="n")
#foo <- do.call("rbind", list(
# data.frame(cpgtpg=n_cpgtpg/59/6, x=b[1,2]),
# data.frame(cpgtpg=normal_skin_cpgtpg, x=b[2,2]),
# data.frame(cpgtpg=tab$C.T.at.CpG/tab$Age/6, x=b[3,2]),
# data.frame(cpgtpg=colon_sbs$CtoTatCpG/colon_sbs$age/6, x=b[4,2])))
#points(cpgtpg ~ jitter(x,.5), data=foo, cex=.5, pch=16, col="#00000044")
segments(b,x[,"2.5%",],b,x[,"97.5%",])
#dev.off()
xlsx output
ExtendedDataFigure8d <- createSheet(ExtendedDataFigure8, "ExtendedDataFigure8d")
addDataFrame(x, ExtendedDataFigure8d)
Now we time the latency of the most recent common ancestor (MRCA), or more accurately we compute the time between MRCA and the last observable subclone. An implicit assumption is that that the latency between the emergence of the last detectable subclone and diagnosis is short with respect to the time between fertilisation and diagnosis.
Acceleration values to simulate
accel <- c(1,2.5,5,7.5,10,20)
names(accel) <- paste0(accel, "x")
The actual timing using the age of each individual. This assumes that the branch length of the subclonal period is to be devided by the acceleration (ie mutation rate increase). We propagate Poisson error to obtain CIs.
set.seed(42)
d <- droplevels(donor2type[sample2donor[names(finalSnv)]])
computeSubclonesTimeAbs <- function(l, b) {
i <- d==l
tt0 <- b[i,]/cbind(finalPloidy[i], effGenome[i]) #/ cbind(nClones[i]-1, 1)
resB <- sapply(1:1000, function(foo){ ## Assess the impact of Poisson fluctuations on numbers
tt <- matrix(rpois(length(tt0), lambda=tt0), ncol=ncol(tt0))
res <- sapply(accel, function(a) tt[,1]/a/rowSums(tt/rep(c(a,1), each=nrow(tt)))) * age[sample2donor[names(finalSnv)[i]]]
colnames(res) <- paste0(accel, "x")
#res[res==0] <- NA
res}, simplify='array')
res <- sapply(accel, function(a) tt0[,1]/a/rowSums(tt0/rep(c(a,1), each=nrow(tt0)))) * age[sample2donor[names(finalSnv)[i]]]
colnames(res) <- paste0(accel, "x")
resCI <- apply(resB,1:2, quantile, c(0.1,0.9), na.rm=TRUE)
arr <- abind::abind(res, resCI, along=1)
rownames(arr)[1] <- "hat"
arr <- aperm(arr, c(2,1,3))
tt0[is.infinite(tt0)|is.nan(tt0)] <- 0
r <- which(rowSums(b[i,]) < 50 ) ## Exclude samples with less than 50 subs
arr[r,,] <- NA
return(arr)
}
Calculate for linear and branching topologies
subclonesTimeAbs <- mclapply(typesSubclones, computeSubclonesTimeAbs, b=branchDeam)
subclonesTimeAbsLinear <- mclapply(typesSubclones, computeSubclonesTimeAbs, b=branchDeamLinear)
names(subclonesTimeAbsLinear) <- names(subclonesTimeAbs) <- typesSubclones
Buest guess acceleration values (usually 5x, except Ovarian and CNS).
guessAccel <- sapply(subclonesTimeAbs, function(x) "5x")
guessAccel["Ovary-AdenoCa"] <- guessAccel["Liver-HCC"] <- "7.5x"
guessAccel[grep('CNS', names(guessAccel))] <- "2.5x"
Plot the inferred latencies per sample with overlaid boxplots.
u <- setdiff(names(finalSnv)[uniqueSamples], remove)
par( mar=c(7,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
qSubclone <- sapply(subclonesTimeAbs, function(x) apply(x[,"hat",][rownames(x)%in%u,,drop=FALSE], 2, quantile, c(0.05,0.25,0.5,0.75,0.95), na.rm=TRUE), simplify='array')
a <- "5x"
subclonesTimeAbsType <- sapply(names(subclonesTimeAbs), function(n) {x <- subclonesTimeAbs[[n]]; x[,,guessAccel[n]][setdiff(rownames(x),remove), 1:3, drop=FALSE]})
m <- diag(qSubclone["50%",guessAccel[dimnames(qSubclone)[[3]]],])#t[1,3,]
names(m) <- dimnames(qSubclone)[[3]]
nSubclones <- sapply(subclonesTimeAbsType, function(x) sum(!is.na(x[,1])))
m[nSubclones < 5] <- NA
o <- order(m, na.last=NA)
plot(NA,NA, xlim=c(0.5,length(m[o])), ylab="Years before diagnosis", xlab="", xaxt="n", yaxs="i", ylim=c(0,30))
abline(h=seq(10,20,10), col="#DDDDDD", lty=3)
x <- seq_along(m[o])
mg14::rotatedLabel(x, labels=names(sort(m)))
for(i in seq_along(o))try({
n <- names(m)[o[i]]
f <- function(x) x/max(abs(x))
a <- guessAccel[n]
bwd <- 0.8/2
j <- if(length(na.omit(subclonesTimeAbsType[[o[i]]][,"hat"]))>1) f(mg14::violinJitter(na.omit(subclonesTimeAbsType[[o[i]]][,"hat"]))$y)/4 + i else i
tpy <- 2
segments(j, na.omit(subclonesTimeAbsType[[o[i]]][,"90%"]), j, na.omit(subclonesTimeAbsType[[o[i]]][,"10%"]), col=mg14::colTrans(tissueLines[n],tpy))
points(j, na.omit(subclonesTimeAbsType[[o[i]]][,"hat"]), pch=21, col=mg14::colTrans(tissueBorder[n], tpy), bg=mg14::colTrans(tissueColors[n],tpy),
cex=tissueCex[n]*2/3, lwd=1)
rect(i-bwd,qSubclone["25%",a,n],i+bwd,qSubclone["75%",a,n], border=tissueLines[n], col=paste(tissueColors[n],"44", sep=""))
segments(i-bwd,qSubclone["50%",a,n],i+bwd,qSubclone["50%",a,n],col=tissueLines[n], lwd=2)
segments(i,qSubclone["75%",a,n],i,qSubclone["95%",a,n],col=tissueLines[n], lwd=1.5)
segments(i,qSubclone["5%",a,n],i,qSubclone["25%",a,n],col=tissueLines[n], lwd=1.5)
f <- function(x) x/max(abs(x))
})
#par(xpd=TRUE)
#s <- 12/8
#dev.copy2pdf(file="realTimeSubclone.pdf", width=6*s, height=3.5*3/5*s, pointsize=8*s)#' xlsx
xlsx output
ExtendedDataFigure9f <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9f")
tab <- data.frame(do.call("rbind", subclonesTimeAbsType))
tab$tumour_type <- donor2type[sample2donor[rownames(tab)]]
addDataFrame(tab, ExtendedDataFigure9f)
Alternative representation of median acceleration across the range of simulated rate increases, per cancer type:
#pdf("realTimeSubcloneMed.pdf", width=6, height=2.225, pointsize=8)
u <- setdiff(names(finalSnv)[uniqueSamples], remove)
m <- qSubclone["50%","5x",]#t[1,3,]
names(m) <- dimnames(qSubclone)[[3]]
m[nSubclones < 5] <- NA
o <- order(m, na.last=NA)
n <- dimnames(qSubclone)[[3]]
par( mar=c(7,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
plot(NA,NA, xlim=c(0.5,length(m[o])), ylab="Latency [yr]", xlab="", xaxt="n", yaxs="i", ylim=c(0,15))
abline(h=seq(10,20,10), col="#DDDDDD", lty=3)
x <- seq_along(m[o])
mg14::rotatedLabel(x, labels=names(sort(m)))
b <- .3
rect(seq_along(o)-b,qSubclone["50%","1x",o],seq_along(o)+b,qSubclone["50%","10x",o], col=paste(tissueColors[n[o]],"88", sep=""), border=1)
rect(seq_along(o)-b,qSubclone["50%","2.5x",o],seq_along(o)+b,qSubclone["50%","7.5x",o], col=paste(tissueColors[n[o]],"FF", sep=""), border=1)
rect(seq_along(o)-b,qSubclone["50%","20x",o],seq_along(o)+b,qSubclone["50%","10x",o], col=paste(tissueColors[n[o]],"22", sep=""), border=1)
segments(seq_along(o)-b,qSubclone["50%","5x",o],seq_along(o)+b,qSubclone["50%","5x",o])
#par(xpd=TRUE)
#s <- 12/8
#dev.copy2pdf(file="realTimeSubclone.pdf", width=6*s, height=3.5*3/5*s, pointsize=8*s)
sapply(subclonesTimeAbs, nrow)
## Biliary-AdenoCa Bladder-TCC Bone-Benign Bone-Epith Bone-Osteosarc Breast-AdenoCa
## 35 23 16 10 38 198
## CNS-GBM CNS-Medullo CNS-Oligo CNS-PiloAstro Cervix-SCC ColoRect-AdenoCa
## 41 146 18 89 18 60
## Eso-AdenoCa Head-SCC Kidney-CCRCC Kidney-ChRCC Kidney-PapRCC Liver-HCC
## 98 57 111 45 33 326
## Lung-AdenoCa Lung-SCC Lymph-BNHL Lymph-CLL Myeloid-AML Myeloid-MPN
## 38 48 107 95 11 55
## Ovary-AdenoCa Panc-AdenoCa Panc-Endocrine Prost-AdenoCa Skin-Melanoma SoftTissue-Leiomyo
## 113 241 85 286 107 15
## SoftTissue-Liposarc Stomach-AdenoCa Thy-AdenoCa Uterus-AdenoCa
## 19 75 48 51
xlsx output
Figure5c <- createSheet(Figure5, "Figure5c")
addDataFrame(aperm(qSubclone, c(3,1,2)), Figure5c)
Comparison of branching v linear
subclonesTimeAbsTypeLinear <- sapply(names(subclonesTimeAbsLinear), function(n) {x <- subclonesTimeAbsLinear[[n]]; x[,,guessAccel[n]][setdiff(rownames(x),remove), 1:3, drop=FALSE]})
qSubcloneLinear <- sapply(subclonesTimeAbsLinear, function(x) apply(x[,"hat",][rownames(x)%in%u,,drop=FALSE], 2, quantile, c(0.05,0.25,0.5,0.75,0.95), na.rm=TRUE), simplify='array')
n <- diag(qSubcloneLinear["50%",guessAccel[dimnames(qSubcloneLinear)[[3]]],])#t[1,3,]
names(n) <- dimnames(qSubcloneLinear)[[3]]
par( mar=c(5,3,3,10), mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="n", xpd=FALSE, las=1)
plot(c(rep(1, length(m)), rep(2, each=length(n))), c(m,n), bg=tissueColors[c(names(m), names(n))], pch=21, cex=1, xaxt="n", ylab="Years after MRCA", xlab="", xlim=c(0.5,2.5), ylim=c(0, max(n, na.rm=TRUE)))
segments(rep(1, each=length(m)), m, rep(2, each=length(n)), n,col=tissueLines[names(m)], lty= ifelse(sapply(subclonesTimeAbsType, nrow) <= 5, 3, tissueLty[names(m)]))
o <- order(n, na.last=NA)
y0 <- n[o]
y1 <- mg14:::mindist(n[o], diff(par('usr')[3:4])/30)
par(xpd=NA)
mtext(names(m)[o], at=y1, side=4 )
segments(2.1,y0,2.2,y0)
segments(2.2,y0,2.3,y1)
segments(2.3,y1,2.4,y1)
mg14::rotatedLabel(1:2, labels=c("Branching","Linear"))
xlsx output
ExtendedDataFigure9g <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9g")
addDataFrame(data.frame(branching=m, linear=n), ExtendedDataFigure9g)
Numbers per decade
yy <- do.call("rbind",subclonesTimeAbsType)
yy <- yy[setdiff(rownames(yy), remove),"hat"]
table(cut(yy, seq(0,60,10)))
##
## (0,10] (10,20] (20,30] (30,40] (40,50] (50,60]
## 1863 27 4 2 1 0
The other landmark event that can be timed in approximately 30% of samples are whole-genome doublings.
Calculate relative timing estimates based on deaminations. The following function calculates molecular timing estimates by only evaluating CpG>TpG mutations, separately for aggreagated 2:0, 2:1, and 2:2 regions in each sample.
computeWgdParamDeam <- function(vcf, bb, clusters, purity){
# 1. Find segments compatible with WGD
min.dist <- 0.05
m <- findMainCluster(bb)
l <- pmin(bb$time.lo, bb$time - min.dist)
u <- pmax(bb$time.up, bb$time + min.dist)
o <- which(l <= m & u >= m)
# 2. Find deaminations in compatible segments
w <- which(info(vcf)$MajCN==2 & sapply(info(vcf)$CNID, length)==1 & isDeamination(vcf) & vcf %over% bb[o])
if(donor2type[sample2donor[meta(header(vcf))$META["ID",]]]=="Skin-Melanoma")
w <- intersect(w, which(isDeaminationNoUV(vcf)))
v <- vcf[w]
if(nrow(v)<=100) return(NULL) # At least 100 SNVs
seqnames(rowRanges(v)) <- factor(3-info(v)$MinCN, levels=seqlevels(v))
v <- sort(v)
# 3. Merged CN segments
b <- GRanges(1:3, IRanges(rep(1,3),rep(max(end(v)),3)), copy_number=4:2, major_cn=2, minor_cn=2:0, clonal_frequency=as.numeric(purity))
# 4. Calculate times
l <- computeMutCn(v, b, clusters, purity, isWgd=TRUE, n.boot=200, rho=0.01, xmin=3)
b$n.snv_mnv <- l$n <- table(factor(info(v)$MinCN, levels=2:0))
l$time <- bbToTime(b, l$P)
return(l)
}
Takes ~1h.
wgdParamDeam <- mclapply(names(finalSnv)[isWgd], function(ID){
try(computeWgdParamDeam(finalSnv[[ID]], finalBB[[ID]], clusters=finalClusters[[ID]], purity=finalPurity[ID]))
})
names(wgdParamDeam) <- names(finalSnv)[isWgd]
Samples with insufficient data
void <- sapply(wgdParamDeam, function(x) is.null(x) | class(x)=="try-error")
Some checks
t <- sapply(wgdParamDeam[!void], function(x) {r <- as.matrix(x$time[,2:4]); rownames(r) <- x$time[,1];r}, simplify='array')
pairs(t(t[,"time",]))
Now transform the molecular timing estimates (effectively branch lengths, scaled to 1) accounting for possinle late mutation rate increase of up to 20x. This returns a large array.
wgdTimeDeamAcc <- simplify2array(mclapply(names(wgdParamDeam[!void]), function(n) {
x <- wgdParamDeam[!void][[n]]
T.clonal <- as.matrix(x$time[,2:4]) # Time of WGD as fraction of clonal
n.subclonal <- aggregate(x$D[,"pSub"], list(x$D[,"CNF"]), sum)
m <- match(n.subclonal$Group.1, finalClusters[[n]]$proportion)
p.subclonal <- x$power.c[m] # Power of subclones
b.subclonal <- n.subclonal$x %*% (n.subclonal$Group.1 / p.subclonal) / max(n.subclonal$Group.1) # Subclonal branch, power adjusted & 1/f-scaled
b.clonal <- sum(1-x$D[,"pSub"])/p.subclonal['1'] # Clonal branch (trunk), power adjusted & 1/f-scaled
f.subclonal <- b.subclonal / (b.subclonal + b.clonal)
G.clonal <- sum (1-x$D$pSub)/sum((1-x$D$pSub)*x$D$MutCN/(x$D$MajCN + x$D$MinCN)) # Effective ploidy clonal, adjusted for timing
G.subclonal <- sum(x$D$pSub*(x$D$MajCN + x$D$MinCN))/ sum (x$D$pSub) # Final ploidy
if(is.nan(G.subclonal)) G.subclonal <- mean(x$D$MajCN + x$D$MinCN)
ag <- age[sample2donor[names(finalBB)[isWgd][!void][j]]]
tmin <- max(0.5, 1-15/ag) # 15yrs or 50%, whatever smaller (median ~ 0.75 mutation time)
if(is.na(tmin)) tmin <- 0.8
ta=seq(tmin,1,l=20)
.correctAccel <- function(T.clonal, f.subclonal, G.clonal, G.subclonal, ta, a){ # Helper function to correct accel a at clonal time ta
t1 <- T.clonal + (1-T.clonal) *(a-1)/a*ta #acc before dup
t2 <- T.clonal * (ta + a*(1-ta)) ## after
T.clonal.adj <- pmin(t1, t2) # as fraction of clonal
a.clonal <- ta + (1-ta)*a # effective rate, avg over clonal
T.subclonal.abs <- f.subclonal / G.subclonal / a
T.clonal.abs <- (1-f.subclonal) / G.clonal/ a.clonal
T.clonal.abs <- T.clonal.abs / (T.clonal.abs + T.subclonal.abs) # as fraction of all mutations
return(c(T.WGD=T.clonal.adj * T.clonal.abs, T.MRCA=T.clonal.abs))
}
.correctAccelRand <- function(T.clonal, f.subclonal, G.clonal, G.subclonal, ta=seq(0.8,1,0.01), a=seq(1,10,1)){ # Helper to calculate range of accel a and times
sapply(ta, function(taa) sapply(a, function(aa) .correctAccel(T.clonal, f.subclonal, G.clonal, G.subclonal, taa, aa)), simplify='array')
}
res <- apply(T.clonal, 1:2, .correctAccelRand, f.subclonal, G.clonal, G.subclonal, a=accel, ta=seq(tmin,1,l=20))
dim(res) <- c(2, length(accel), length(ta), dim(res)[-1])
return(res)
}))
dimnames(wgdTimeDeamAcc)[1:2] <- list(c("T.WGD","T.MRCA"), names(accel))
dimnames(wgdTimeDeamAcc)[[5]] <- colnames(wgdParamDeam[[1]]$time)[2:4]
dimnames(wgdTimeDeamAcc)[[4]] <- levels(finalBB[[1]]$type)[c(3,1,2)]
dimnames(wgdTimeDeamAcc)[[6]] <- names(wgdParamDeam[!void])
n <- dimnames(wgdTimeDeamAcc)[[6]]
d <- droplevels(donor2type[sample2donor[n]])
s <- setdiff(levels(d), c(typeNa, names(which(table(d)<3))))
Lastly calculate the corresponding real time by scaling with age at diagnosis
f <- function(n, mu, a, b){ ## asymmetric normal to interpolate CIs of the timing estimates
r <- rnorm(n, sd=a)
w <- which(r>0)
r[w] <- r[w]*(b/a)[w]
return(r + mu)
}
wgdTimeAbs <- sapply(s, function(l) {
set.seed(42)
i <- d==l & ! n %in% c(rownames(purityPloidy)[purityPloidy$wgd_uncertain])
## absolute time by multiplying with age at diagnosis
absTimeSeg <- aperm((1-wgdTimeDeamAcc["T.WGD",,,,,i]) * rep(age[sample2donor[n]][i], each = prod(dim(wgdTimeDeamAcc)[c(2,3,4,5)])))
w <- t(sapply(wgdParamDeam[n[i]], `[[`, "n")) #number of SNV as weights
## remove NA due to zero mutations
for(c in 1:3)
absTimeSeg[,,c,,][is.na(absTimeSeg[,,c,,]) & w[,c]==0] <- 0
## weighted average over 2+0, 2+1 and 2+2 segments
absTime <- (absTimeSeg[,,1,,] * w[,1] + absTimeSeg[,,2,,] * w[,2] + absTimeSeg[,,3,,] * w[,3]) / rowSums(w)
rownames(absTime) <- n[i]
## Median over acceleration onset
absTimeMed <- apply(absTime, c(1,2,4), median, na.rm=TRUE)
colnames(absTimeMed) <- c("hat","up","lo")
## Simulate distribution sampling from timing onset and mutation time CI
ts <- sapply(1:1000, function(foo) {ax <- sample(1:20,1); matrix(f(length(absTime[,"time",ax,]),a=abs(absTime[,"time",ax,] - absTime[,"time.up",ax,])/2, b=abs(absTime[,"time.lo",ax,]-absTime[,"time",ax,])/2, mu=absTime[,"time",ax,]), nrow=dim(absTime)[1])}, simplify='array')
me <- apply(ts, 1:2, quantile, c(0.1, 0.8), na.rm=TRUE) # 80% CIs
absTimeMed[,"lo",] <- (me[1,,])
absTimeMed[,"up",] <- (me[2,,])
absTimeMed[,c(1,3,2),]
}, simplify=FALSE)
First plot the inferred latency of WGD for each sample, per cancer type, overlaid by boxplots, using buest-guess acceleration.
par( mar=c(7,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
u <- setdiff(names(finalSnv)[uniqueSamples], remove)
qWgd <- sapply(wgdTimeAbs, function(x) apply(x[rownames(x) %in% u,"hat",], 2, quantile, c(0.05,0.25,0.5,0.75,0.95), na.rm=TRUE), simplify='array')
nWgd <- sapply(wgdTimeAbs, function(x) sum(x[rownames(x) %in% u,"hat","1x"]!=0, na.rm=TRUE))
wgdTimeAbsType <- sapply(names(wgdTimeAbs), function(n) {x <- wgdTimeAbs[[n]]; x[,,guessAccel[n]][setdiff(rownames(x),remove), 1:3, drop=FALSE]})
m <- diag(qWgd["75%",guessAccel[dimnames(qWgd)[[3]]],])#t[1,3,]
names(m) <- dimnames(qWgd)[[3]]
o <- order(m, na.last=NA)
x <- seq_along(m[o])
plot(NA,NA, xlim=c(0.5,length(m[o])), ylim=c(0,max(do.call('rbind',wgdTimeAbsType)[,1], na.rm=TRUE)+5), ylab="Years before diagnosis", xlab="", xaxt="n", yaxs="i")
abline(h=seq(10,60,10), col="#DDDDDD", lty=3)
mg14::rotatedLabel(x, labels=names(sort(m)))
for(i in seq_along(o)){
n <- names(m)[o[i]]
f <- function(x) x/max(abs(x))
a <- guessAccel[n]
j <- f(mg14::violinJitter(na.omit(wgdTimeAbsType[[o[i]]][,"hat"]))$y)/4 + i #rank(na.omit(tWgdByType[[o[i]]][,"hat"]))/2/length(na.omit(tWgdByType[[o[i]]][,"hat"]))-0.25+i #
tpy <- if(grepl("Skin|Lung", n)) 4 else 2
segments(j, na.omit(wgdTimeAbsType[[o[i]]][,"up"]), j, na.omit(wgdTimeAbsType[[o[i]]][,"lo"]), col=mg14::colTrans(tissueLines[n],tpy), lty=tissueLty[n])
points(j, na.omit(wgdTimeAbsType[[o[i]]][,"hat"]), pch=21, col=mg14::colTrans(tissueBorder[n], tpy), bg=mg14::colTrans(tissueColors[n],tpy),
cex=tissueCex[n]*2/3, lwd=1)
bwd <- 0.8/2
rect(i-bwd,qWgd["25%",a,n],i+bwd,qWgd["75%",a,n], border=tissueLines[n], col=paste0(tissueColors[n],"44"))
segments(i-bwd,qWgd["50%",a,n],i+bwd,qWgd["50%",a,n],col=tissueLines[n], lwd=2)
segments(i,qWgd["75%",a,n],i,qWgd["95%",a,n],col=tissueLines[n], lwd=1.5)
segments(i,qWgd["5%",a,n],i,qWgd["25%",a,n],col=tissueLines[n], lwd=1.5)
}
par(xpd=TRUE)
#s <- 12/8
#dev.copy2pdf(file="realTimeWgd.pdf", width=4*s, height=3.5*s, pointsize=8*s)
xlsx output
ExtendedDataFigure9a <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9a")
tab <- data.frame(do.call("rbind", wgdTimeAbsType))
tab$tumour_type <- donor2type[sample2donor[rownames(tab)]]
addDataFrame(tab, ExtendedDataFigure9a)
Plot the median timing value for WGD, as a function of the unknown rate increase.
#pdf("realTimeWgdMed.pdf", width=4.5, height=3, pointsize=8)
u <- setdiff(names(finalSnv)[uniqueSamples], remove)
m <- qWgd["50%","5x",]#t[1,3,]
names(m) <- dimnames(qWgd)[[3]]
m[nWgd < 5] <- NA
o <- order(m, na.last=NA)
n <- dimnames(qWgd)[[3]]
par( mar=c(7,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
plot(NA,NA, xlim=c(0.5,length(m[o])), ylab="Latency [yr]", xlab="", xaxt="n", yaxs="i", ylim=c(0,40))
abline(h=seq(10,20,10), col="#DDDDDD", lty=3)
x <- seq_along(m[o])
mg14::rotatedLabel(x, labels=names(sort(m)))
b <- .3
rect(seq_along(o)-b,qWgd["50%","1x",o],seq_along(o)+b,qWgd["50%","10x",o], col=paste(tissueColors[n[o]],"88", sep=""), border=1)
rect(seq_along(o)-b,qWgd["50%","2.5x",o],seq_along(o)+b,qWgd["50%","7.5x",o], col=paste(tissueColors[n[o]],"FF", sep=""), border=1)
rect(seq_along(o)-b,qWgd["50%","20x",o],seq_along(o)+b,qWgd["50%","10x",o], col=paste(tissueColors[n[o]],"22", sep=""), border=1)
segments(seq_along(o)-b,qWgd["50%","5x",o],seq_along(o)+b,qWgd["50%","5x",o])
xlsx output
Figure5b <- createSheet(Figure5, "Figure5b")
addDataFrame(aperm(qWgd, c(3,1,2)), Figure5b)
As a quality control, plot a range of extremely early samples
t <- do.call("rbind", wgdTimeAbsType)
o <- order(t[,"hat"], na.last=NA)
for(n in rownames(t)[tail(o, 20)])
plotSample(n, title=paste0(sub("-.+","",n),", ", donor2type[sample2donor[n]], ", ",round(t[n,"hat"]),"yr"))
Numbers per decade
yy <- do.call("rbind",wgdTimeAbsType)
yy <- yy[setdiff(rownames(yy), remove),"hat"]
table(cut(yy, seq(0,60,10)))
##
## (0,10] (10,20] (20,30] (30,40] (40,50] (50,60]
## 389 86 47 20 11 9
A panel of scatter plots showing WGD time as a function of age at diagnosis.
par(mfrow=c(6,6), mar=c(3,3,2,1),mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="L", xpd=FALSE, las=1)
for(i in seq_along(wgdTimeAbsType)){
n <- names(wgdTimeAbsType)[i]
y <- wgdTimeAbsType[[n]][,"hat"]
x <- age[sample2donor[names(y)]]
plot(x,x-y, pch=NA, bg=tissueColors[n], col=tissueBorder[n], xlab="Age [yr]", ylab="WGD [yr]", cex=tissueCex[n], xlim=c(0,90), ylim=c(0,90))
segments(x, y0=x-wgdTimeAbsType[[n]][,"up"],y1=x-wgdTimeAbsType[[n]][,"lo"],col=tissueLines[n])
points(x,x-y, pch=21, bg=tissueColors[n], col=tissueBorder[n], cex=tissueCex[n])
# d <- density(na.omit(x), bw="SJ", from=0)
# lines(d$x,d$y*100,col=tissueLines[n], lty=tissueLty[n])
# d <- density(na.omit(x-y), bw="SJ", from=0)
# lines(d$y*100, d$x,col=tissueLines[n], lty=tissueLty[n])
rug(x, col=tissueLines[n],)
rug(x-y, side=2, col=tissueLines[n])
title(main=n, line=0, font.main=1, cex.main=1)
abline(0,1, lty=3)
}
xlsx output
ExtendedDataFigure9b <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9b")
tab <- data.frame(do.call("rbind", wgdTimeAbsType))
tab$age <- age[sample2donor[rownames(tab)]]
tab$tumour_type <- donor2type[sample2donor[rownames(tab)]]
addDataFrame(tab, ExtendedDataFigure9b)
For a rough comparison take the average co-amplification molecular time estimate calculated above against the estimate based on CpG>TpG mutations.
t <- sapply(wgdParamDeam[!void], function(x) x$time$time)
plot(timingInfo[colnames(t),'timeCoamp'],
colMeans(t),
cex=sqrt(nSub[colnames(t)]/10000),
pch=21,bg=tissueColors[donor2type[sample2donor[colnames(t)]]],
col=tissueBorder[donor2type[sample2donor[colnames(t)]]],
xlab="time [all mutations]",
ylab="time [CpG>TpG]")
abline(0,1, lty=3)
Plot difference as boxplots:
par(mar=c(6,4,1,1), xaxs='i')
boxplot(timingInfo[colnames(t),'timeCoamp'] - colMeans(t) ~ droplevels(donor2type[sample2donor[colnames(t)]]),
col=tissueColors[levels(droplevels(donor2type[sample2donor[colnames(t)]]))],
na.rm=TRUE, las=2,
lty=1,
staplewex=0,
pch=16,
cex=0.8,
ylab='time [all] - time [CpG>TpG]',
names=NA
)
mg14::rotatedLabel(labels=levels(droplevels(donor2type[sample2donor[colnames(t)]])))
abline(h=0, lty=3)
To gain further insighe assess the distribution of the actual mutation numbers used for WGD timing across (2:0, 2:1, 2:2) regions
nDeam <- sapply(wgdParamDeam[!void], function(x) if(!is.null(x$n)) sum(x$n, na.rm=TRUE) else NA)
Number of deaminations only in 2:2 regions
nDeam22 <- sapply(wgdParamDeam[!void], function(x) if(!is.null(x$n)) as.numeric(x$n[1]) else NA)
w22 <- sapply(finalBB[isWgd][!void], function(bb) {
w <- bb$major_cn==2 & bb$minor_cn==2 & !duplicated(bb)
sum(as.numeric(width(bb)[w]), na.rm=TRUE)})
nDeam22 <- nDeam22/w22*1e9
Fraction of deam on 1 and 2 copies
d <- nDeam22 * t(sapply(wgdParamDeam[!void], function(x) if(!is.null(x$P)) x$P[[1]][1:2,"P.m.sX"] else c(NA,NA)))
d[rownames(d) %in% remove] <- NA
Unadjusted time (inc. of subclonal mutations)
t0 <- colMeans(wgdTimeDeamAcc["T.WGD","1x",1,,"time",],na.rm=TRUE) # WGD
names(t0) <- dimnames(wgdTimeDeamAcc)[[6]]
m0 <- colMeans(wgdTimeDeamAcc["T.MRCA","1x",1,,"time",],na.rm=TRUE) # MRCA
names(m0) <- dimnames(wgdTimeDeamAcc)[[6]]
Conceptual plot
#pdf("chronRateIncrease.pdf",2,2,pointsize=8)
.par()
a <- c(1,2.5,5,7.5,10, 20)
x <- age[sample2donor["052665d1-ab75-4f40-be5a-b88154c8beed"]]
y <- nDeam["052665d1-ab75-4f40-be5a-b88154c8beed"]
t <- seq(0,x, by=0.1)
f <- function(t, ta, a){y <- t; y[t>ta] <- y[t>ta] + (y[t>ta] - min(y[t>ta]))*(a-1); y <- y/max(y)}
r <- sapply(a, function(aa){
apply(sapply(x - seq(15,0,l=100), function(ta) f(t, ta, aa)),1,mean)
})
## Warning in min(y[t > ta]): no non-missing arguments to min; returning Inf
## Warning in min(y[t > ta]): no non-missing arguments to min; returning Inf
## Warning in min(y[t > ta]): no non-missing arguments to min; returning Inf
## Warning in min(y[t > ta]): no non-missing arguments to min; returning Inf
## Warning in min(y[t > ta]): no non-missing arguments to min; returning Inf
## Warning in min(y[t > ta]): no non-missing arguments to min; returning Inf
plot(NA, NA, xlim=c(0,x), ylim=c(0,y), xlab="Age", ylab="CpG>TpG mutations")
for(i in seq_along(a)) lines(t, r[,i]*y)
w <- t0["052665d1-ab75-4f40-be5a-b88154c8beed"]
v <- m0["052665d1-ab75-4f40-be5a-b88154c8beed"]
segments(0,0, 0,w*y, col=set1[3])
text(10, w*y, "WGD", pos=3)
segments(0,w*y, 0, v*y, col=set1[4], lwd=2) # (WGD, MRCA)
text(0, v*y, "MRCA", pos=4)
segments(0,v*y, 0, y, col=set1[1], lwd=3) # (MRCA, diag)
for(i in seq_along(a)){
x0 = t[which.min(abs(w - r[,i]))]
segments(x0,0,x0,w*y, lty=3)
}
segments(0,w*y, x0, w*y, lty=3)
#dev.off()
Plot time v early and total number of mutations
y <- d[names(t0),]/6
x <- t0
t <- donor2type[sample2donor[names(t0)]]
plot(x,y[,2], bg=tissueColors[t], pch=21, col=tissueBorder[t], cex=tissueCex[t]*1, lwd=0.5, xlab="Time", ylab="Early SNVs/Gb", log='')
p <- predict(loess(y[,2]~x, span=1), newdata=sort(x, na.last=NA), se=TRUE)
r <- function(x) c(x, rev(x))
polygon(r(sort(x, na.last=NA)), c(p$fit+2*p$se, rev(p$fit-2*p$se)), col="#00000044", border=NA)
lines(sort(x, na.last=NA),p$fit)
plot(x,y[,1]+y[,2], bg=tissueColors[t], pch=21, col=tissueBorder[t], cex=tissueCex[t]*1, lwd=0.5, xlab="Time", ylab="Total SNVs/Gb", log='')
p <- predict(loess(rowSums(y)~x, span=1), newdata=sort(x, na.last=NA), se=TRUE)
r <- function(x) c(x, rev(x))
polygon(r(sort(x, na.last=NA)), c(p$fit+2*p$se, rev(p$fit-2*p$se)), col="#00000044", border=NA)
lines(sort(x, na.last=NA),p$fit)
#s <- 12/8; dev.copy2pdf(file="nDeam22Time.pdf", width=2*s, height=2*s, pointsize=8*s)
Reassuringly, the molecular timing estimate appears independent of the total mutation count and largely driven by a depletion of early variants (rather than an excess of late variants).
xlsx outputs
ExtendedDataFigure9c <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9c")
tab <- data.frame(`Molecular Time`=x, `Early CpG>TpG/Gb`=y[,2])
addDataFrame(tab, ExtendedDataFigure9c)
ExtendedDataFigure9d <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9d")
tab <- data.frame(`Molecular Time`=x, `Total CpG>TpG/Gb`=rowSums(y))
addDataFrame(tab, ExtendedDataFigure9d)
So far we used a fixed rate increase across samples. If some samples experienced a higher increase, this should manifest as a relative excess of mutations. Hence, calculate timing estimates using greater rate increase for samples with a higher CpG>TpG burden (per cancer type).
accelRelWgd <- sapply(names(wgdTimeAbs), function(n) {
r <- rateDeam[[n]]
x <- wgdTimeAbs[[n]]
r0 <- quantile(r[!names(r) %in% remove], 0.20, na.rm=TRUE)
a <- cut(pmin(pmax(1,(r/r0-0.9)/0.1),10), c(0,1.5,3.75,6.25,8.75,20), labels=c("1x","2.5x","5x","7.5x","10x"))
print(table(a))
names(a) <- names(r)
ta <- sapply(rownames(x), function(ss) x[ss, "hat",a[ss]])
ta
})
## a
## 1x 2.5x 5x 7.5x 10x
## 10 5 3 1 16
## a
## 1x 2.5x 5x 7.5x 10x
## 7 4 0 5 7
## a
## 1x 2.5x 5x 7.5x 10x
## 8 5 3 2 19
## a
## 1x 2.5x 5x 7.5x 10x
## 39 27 52 26 40
## a
## 1x 2.5x 5x 7.5x 10x
## 10 9 10 2 10
## a
## 1x 2.5x 5x 7.5x 10x
## 35 17 21 9 54
## a
## 1x 2.5x 5x 7.5x 10x
## 5 5 3 4 1
## a
## 1x 2.5x 5x 7.5x 10x
## 12 10 10 5 23
## a
## 1x 2.5x 5x 7.5x 10x
## 26 18 10 13 30
## a
## 1x 2.5x 5x 7.5x 10x
## 17 4 9 5 22
## a
## 1x 2.5x 5x 7.5x 10x
## 30 26 24 15 16
## a
## 1x 2.5x 5x 7.5x 10x
## 9 3 4 5 24
## a
## 1x 2.5x 5x 7.5x 10x
## 10 4 7 4 8
## a
## 1x 2.5x 5x 7.5x 10x
## 87 117 75 23 24
## a
## 1x 2.5x 5x 7.5x 10x
## 8 3 2 3 19
## a
## 1x 2.5x 5x 7.5x 10x
## 11 9 15 7 6
## a
## 1x 2.5x 5x 7.5x 10x
## 25 14 13 11 44
## a
## 1x 2.5x 5x 7.5x 10x
## 28 38 29 11 7
## a
## 1x 2.5x 5x 7.5x 10x
## 55 55 50 28 50
## a
## 1x 2.5x 5x 7.5x 10x
## 22 3 3 8 49
## a
## 1x 2.5x 5x 7.5x 10x
## 78 40 39 38 89
## a
## 1x 2.5x 5x 7.5x 10x
## 24 16 21 9 37
## a
## 1x 2.5x 5x 7.5x 10x
## 4 4 2 2 3
## a
## 1x 2.5x 5x 7.5x 10x
## 5 6 4 1 3
## a
## 1x 2.5x 5x 7.5x 10x
## 23 5 7 5 34
## a
## 1x 2.5x 5x 7.5x 10x
## 12 13 7 5 14
par(mar=c(3,3,1,1),mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="L", xpd=FALSE, las=1)
x <- Reduce("c",accelRelWgd)
y <- Reduce("c",sapply(wgdTimeAbs, function(x) x[,"hat","5x"]))
t <- donor2type[sample2donor[names(x)]]
plot(y+runif(length(y)),x+runif(length(y)), pch=21, bg=tissueColors[t], col=tissueBorder[t], cex=tissueCex[t]*1, lwd=0.5, xlab="Time (constant acceleration)", ylab="Time (sample-specific accel.)", log='')
#s <- 12/8; dev.copy2pdf(file="accelRelWgd.pdf", width=2*s, height=2*s, pointsize=8*s)
Compare quartiles of fixed and sample-specific timing approaches:
par(mar=c(3,3,1,1),mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="L", xpd=FALSE, las=1)
u <- setdiff(names(finalSnv)[uniqueSamples], remove)
qAccelRelWgd <- sapply(accelRelWgd, function(x){
quantile(x[names(x) %in% u], na.rm=TRUE)
})
t <- colnames(qAccelRelWgd)
plot(qWgd["50%","5x",], qAccelRelWgd["50%",], pch=NA, xlab="Time [years], constant acceleration", ylab="Time [years], sample-specific accel.", xlim=c(0,30), ylim=c(0,30))
abline(0,1, lty=3)
segments(qWgd["25%","5x",], qAccelRelWgd["50%",],qWgd["75%","5x",], qAccelRelWgd["50%",], lty=tissueLty[t], col=tissueLines[t])
segments(qWgd["50%","5x",], qAccelRelWgd["25%",],qWgd["50%","5x",], qAccelRelWgd["75%",], lty=tissueLty[t], col=tissueLines[t])
points(qWgd["50%","5x",], qAccelRelWgd["50%",], bg=tissueColors[t], pch=21, col=tissueBorder[t], cex=tissueCex[t], lwd=0.5)
#s <- 12/8; dev.copy2pdf(file="qAccelRelWgd.pdf", width=2*s, height=2*s, pointsize=8*s)
xlsx output
ExtendedDataFigure9e <- createSheet(ExtendedDataFigure9, "ExtendedDataFigure9e")
tab <- data.frame(fixed_acc=t(qWgd[2:4,"5x",]),sample_acc=t(qAccelRelWgd[2:4,]))
addDataFrame(tab, ExtendedDataFigure9e)
Mutations per year vs time. If the mutation rate was constant, there should be a proportional increase due to the double opportunity to mutate after WGD.
par(mfrow=c(6,6), mar=c(3,3,2,1),mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
for(n in names(wgdTimeAbsType)){
a <- age[sample2donor[rownames(wgdTimeAbsType[[n]])]]
yy <- nDeam22[rownames(wgdTimeAbsType[[n]])]/a
xx <- 1-t0[rownames(wgdTimeAbsType[[n]])]#y[[n]][,"hat"]/a
try({
l <- lm(yy ~ xx)
x0 <- -l$coef[2]/l$coef[1]
#print(x0)
plot(xx, yy, bg=tissueColors[n], col=tissueBorder[n],, pch=21, log='', xlab="1-time [mutations]", ylab="SNVs/Gb/yr", main=n, ylim=c(0,max(yy, na.rm=TRUE)), xlim=c(0,max(xx, na.rm=TRUE)), cex.main=1)
#abline(l, lty=3)
#abline(l$coef[1], l$coef[1])
m <- median(yy/(1+xx),na.rm=TRUE)
abline(m, m, lty=3)
#lines(c(x0,2*x0), c(0,1))
})
}
Compare timing estimates of MRCA and WGD. First, scatter of median
par( mar=c(4,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
a <- "5x"
plot(qSubclone["50%",a,dimnames(qWgd)[[3]]], qWgd["50%",a,], col=tissueColors[dimnames(qWgd)[[3]]], pch=16, cex=2, xlab="Median time subclones", ylab="Median time WGD", xlim=c(0,5), ylim=c(0,10))
A dotplot may be more informative
par( mar=c(3,3,3,10), mgp=c(2,.5,0), tcl=-0.25,cex=1, bty="n", xpd=FALSE, las=1)
w <- "50%"
x <- diag(qSubclone[w,guessAccel[dimnames(qSubclone)[[3]]],])#t[1,3,]
names(x) <- dimnames(qSubclone)[[3]]
y <- diag(qWgd[w,guessAccel[dimnames(qWgd)[[3]]],])#t[1,3,]
names(y) <- dimnames(qWgd)[[3]]
plot(c(rep(1, dim(qSubclone)[3]), rep(2, each=dim(qWgd)[3])), c(x,y), bg=tissueColors[c(dimnames(qSubclone)[[3]], dimnames(qWgd)[[3]])], pch=21, cex=1, xaxt="n", ylab="Years before diagnosis", xlab="", xlim=c(0.5,2.5), ylim=c(0, max(y, na.rm=TRUE)))
segments(rep(1, each=dim(qWgd)[3]), x[dimnames(qWgd)[[3]]], rep(2, each=dim(qWgd)[3]), y,col=tissueLines[dimnames(qWgd)[[3]]], lty= ifelse(nWgd <= 5, 3, tissueLty[dimnames(qWgd)[[3]]]))
o <- order(y, na.last=NA)
y0 <- y[o]
y1 <- mg14:::mindist(y[o], diff(par('usr')[3:4])/30)
par(xpd=NA)
mtext(dimnames(qWgd)[[3]][o], at=y1, side=4 )
segments(2.1,y0,2.2,y0)
segments(2.2,y0,2.3,y1)
segments(2.3,y1,2.4,y1)
mg14::rotatedLabel(1:2, labels=c("Subclones","WGD"))
#segments(2.1,y0,2.3,y1)
#par(xpd=TRUE)
#s <- 12/8
#dev.copy2pdf(file="realTimeSubcloneWgd.pdf", width=2.5*s, height=3.5*s, pointsize=8*s)
Lastly demonstrate concordance of our timing inferences based on multi-sample cases.
donor2sample <- split(names(finalBB),sample2donor[names(finalBB)])
donor2sample <- donor2sample[sapply(donor2sample, length)>1]
WGD - samples with * removed
par(mfrow=c(2,3), mar=c(4,3,2,1), cex=1)
for(n in names(donor2sample)){
t <- as.character(donor2type[n])
try({
s <- wgdTimeAbs[[t]][,,guessAccel[t]]
s <- s[rownames(s) %in% donor2sample[[n]],,drop=FALSE]
if(length(s) ==0 ) next
if(nrow(s) ==1) next
plot(s[,"hat"], ylim=c(0, 30), main = paste(n, t), ylab="Time WGD [yr]", xlab="", xaxt="n", xlim=c(0,nrow(s)+1), pch=NA)
mg14::rotatedLabel(labels=paste0(sample2icgc[rownames(s)],ifelse(rownames(s) %in% remove, "*","")))
segments(x0=1:nrow(s),y0=s[,'lo'], y1=s[,'up'], col=tissueLines[t],lty=tissueLty[t])
points(s[,"hat"], pch=21, bg=tissueColors[t], cex=tissueCex[t], col=tissueBorder[t])
abline(h= mean(s[,'hat']), col=tissueLines[t], lty=3)
})
}
MRCA - samples with * removed
par(mfrow=c(8,8), mar=c(4,3,2,1), cex=1)
for(n in names(donor2sample)){
t <- as.character(donor2type[n])
try({
s <- subclonesTimeAbs[[t]][,,guessAccel[t]]
s <- s[rownames(s) %in% donor2sample[[n]],,drop=FALSE]
if(length(s) ==0 ) next
if(nrow(s) ==1) next
plot(s[,"hat"], ylim=c(0, 20), main = paste(n, t), ylab="Time WGD [yr]", xlab="", xaxt="n", xlim=c(0,nrow(s)+1), pch=NA)
mg14::rotatedLabel(labels=paste0(sample2icgc[rownames(s)],ifelse(rownames(s) %in% remove, "*","")))
par(xpd=NA)
segments(x0=1:nrow(s),y0=s[,'10%'], y1=s[,'90%'], col=tissueLines[t],lty=tissueLty[t])
points(s[,"hat"], pch=21, bg=tissueColors[t], cex=tissueCex[t], col=tissueBorder[t])
par(xpd=TRUE)
abline(h= mean(s[,'hat']), col=tissueLines[t], lty=3)
})
}
In this section we analyse changes of mutation spectra across the clonal timing categories.
Caclulate trinucleotide substitution spectra across all samples and all timing categories.
tncTime <- simplify2array(mclapply(finalSnv, function(vcf) table(tncToPyrimidine(vcf), info(vcf)$CLS)))
Calculate cosine distance between clonal and subclonal, plot 10 samples with strongest change and 1000 SNVs each
w <- which(apply(mg14:::asum(tncTime[,c(1,2),], 1)>1000, 2, all))
dEarlyLate <- sapply(w, function(i) {x <- tncTime[,,i]; 1-cosineDist(x[,1,drop=FALSE],x[,2,drop=FALSE])})
makeTitle <- function(n){
d <- sample2donor[n]
paste0(sample2icgc[n], ", ", donor2type[d], ", ", age[d], "yr")
}
As examples show the 10 samples with the most striking changes.
n <- names(tail(sort(dEarlyLate),10))
sigCol <- sapply(c("#2196F3","#212121","#f44336","#BDBDBD","#8BC34A","#FFAB91"), rep,16)
ttl <- function(){
l <- gsub("\\[|\\]||>.","", dimnames(tncTime)[[1]])
mtext(text=l, at=b, las=2, cex=0.25, side=1)
mtext(at=b[8+16*(0:5)], side=1,text=c("C>A","C>G","C>T","T>A","T>C","T>G"), las=1, line=1)
}
for(s in n){
.par()
par(mfrow=c(2,1), las=2, mar=c(2,3,2,1))
b <- barplot(as.numeric(tncTime[,1,s]), ylab="SNVs", main=makeTitle(s),col=sigCol,border=NA, font.main=1, las=2, cex.main=1)
text(x=b[1],y=par("usr")[4], labels="early clonal", xpd=TRUE, adj=c(0,1))
ttl()
barplot(as.numeric(tncTime[,2,s]), ylab="SNVs", col=sigCol, border=NA,las=2)
text(x=b[1],y=par("usr")[4], labels="late clonal", xpd=TRUE, adj=c(0,1))
ttl()
}
xlsx output
Figure4a <- createSheet(Figure4, "Figure4a")
addDataFrame(as.data.frame(tncTime[,1:2,names(which(sample2icgc[n]=="SA434348"))]), Figure4a)
ExtendedDataFigure6a <- createSheet(ExtendedDataFigure6, "ExtendedDataFigure6a")
x <- tncTime[,1:2,n]
dimnames(x)[[3]] <- sample2icgc[n]
addDataFrame(as.data.frame(x), ExtendedDataFigure6a)
To formatlise what constitutes a measureable change, calculate an LRT
tncLrt <- function(x){
l1 <- dmultinom(x=x[,1], prob=x[,1], log=TRUE) + dmultinom(x=x[,2], prob=x[,2], log=TRUE)
l0 <- dmultinom(x=x[,1], prob=x[,1]+x[,2], log=TRUE) + dmultinom(x=x[,2], prob=x[,1]+x[,2], log=TRUE)
chisq <- 2*(l1-l0)
df <- nrow(x)-1
return(c(chisq=chisq, df=df, p=pchisq(chisq, df=df, lower.tail=FALSE)))
}
w <- which(colSums(tncTime[,1,])>0 & colSums(tncTime[,2,])>0) # Samples with at mutations in both categories
pEarlyLate <- apply(tncTime[,1:2,w],3, tncLrt)
table(p.adjust(pEarlyLate["p",], "bonf")<0.05)
##
## FALSE TRUE
## 1327 525
Refit some mutational signatures, which have apparently been missed in the time-agnostic signature attribution
sbs <- read.csv("../ref/sigProfiler_SBS_signatures_2018_03_28.csv") # Official table
n <- names(which(sample2icgc=="SA434348"))
e <- nmSolve(tncTime[,,n], P = as.matrix(sbs[,grep("(SBS7[a-d])|SBS38|SBS40", colnames(sbs))]), maxIter=10000, tol=1e-6)
## 3333333333333456
a <- rbind(UV=colSums(e[1:4,]),e[5:6,])[,1:3]
apply(t(t(a)/colSums(a))*100,2,mg14:::roundProp)
## clonal [early] clonal [late] clonal [NA]
## UV 89 23 72
## SBS38 2 37 11
## SBS40 9 40 17
n <- names(which(sample2icgc=="SA309724"))
e <- nmSolve(tncTime[,,n], P = as.matrix(sbs[,grep("SBS(2|13|4|5)$", colnames(sbs))]), maxIter=10000)
a <- rbind(APOBEC=colSums(e[c(1,4),]), e[2:3,])
apply(t(t(a)/colSums(a))*100,2,mg14:::roundProp)
## clonal [early] clonal [late] clonal [NA] subclonal
## APOBEC 10 71 29 76
## SBS4 55 7 39 9
## SBS5 35 22 32 15
n <- names(which(sample2icgc=="SA100712"))
e <- nmSolve(tncTime[,,n], P = as.matrix(sbs[,grep("SBS(1|17.|18|5|40)$", colnames(sbs))]), maxIter=10000)
a <- rbind(`17`=colSums(e[c(3:4),]), e[-(3:4),])
apply(t(t(a)/colSums(a))*100,2,mg14:::roundProp)
## clonal [early] clonal [late] clonal [NA] subclonal
## 17 1 32 26 29
## SBS1 26 7 11 3
## SBS5 24 12 13 1
## SBS18 35 10 14 17
## SBS40 14 39 36 50
Calculate cosine distance between clonal and subclonal, plot 10 samples with strongest change and 1000 SNVs each
w <- which(mg14:::asum(tncTime[,1:3,], c(1,2))>1000 & mg14:::asum(tncTime[,4,], 1)>1000)
dClonalSubclonal <- sapply(w, function(i) {x <- tncTime[,,i]; 1-cosineDist(x[,1:3] %*% rep(1,3),x[,4,drop=FALSE])})
n <- names(tail(sort(dClonalSubclonal),10))
for(s in n){
.par()
par(mfrow=c(2,1), las=2, mar=c(2,3,2,1))
b <- barplot(as.numeric(rowSums(tncTime[,1:3,s])), ylab="SNVs", main=makeTitle(s),col=sigCol,border=NA, font.main=1, las=2, cex.main=1)
text(x=b[1],y=par("usr")[4], labels="clonal", xpd=TRUE, adj=c(0,1))
ttl()
barplot(as.numeric(tncTime[,4,s]), ylab="SNVs", col=sigCol, border=NA,las=2)
text(x=b[1],y=par("usr")[4], labels="subclonal", xpd=TRUE, adj=c(0,1))
ttl()
}
xlsx output
x <- tncTime[,c(1,4),n]
x[,1,] <- mg14:::asum(tncTime[,1:3,n], 2)
dimnames(x)[[3]] <- sample2icgc[n]
dimnames(x)[[2]][1] <- "clonal [early/late/NA]"
Figure4b <- createSheet(Figure4, "Figure4b")
addDataFrame(as.data.frame(x[,1:2,"SA557034"]), Figure4b)
ExtendedDataFigure6b <- createSheet(ExtendedDataFigure6, "ExtendedDataFigure6b")
addDataFrame(as.data.frame(x), ExtendedDataFigure6b)
TNC LRT
w <- which(mg14:::asum(tncTime[,1:3,], c(1,2))>0 & mg14:::asum(tncTime[,4,], 1)>0)
pClonalSubclonal <- apply(tncTime[,,w],3, function(x) tncLrt(x %*% matrix(c(1,1,1,0,0,0,0,1), ncol=2)))
table(p.adjust(pClonalSubclonal["p",], "bonf")<0.05)
##
## FALSE TRUE
## 1677 710
Mean absolute differences
n <- names(which(p.adjust(pEarlyLate["p",], "bonf")<0.05))
madEarlyLate <- rowSums(abs(t(tncTime[,1,n])/colSums(tncTime[,1,n]) - t(tncTime[,2,n])/colSums(tncTime[,2,n])))/2
n <- names(which(p.adjust(pClonalSubclonal["p",], "bonf")<0.05))
madClonalSubclonal <- rowSums(abs(t(mg14:::asum(tncTime[,1:3,n],2))/colSums(mg14:::asum(tncTime[,1:3,n],2)) - t(tncTime[,4,n])/colSums(tncTime[,4,n])))/2
More signature changes
n <- names(which(sample2icgc=="SA557034"))
e <- nmSolve(tncTime[,,n], P = as.matrix(sbs[,grep("SBS(1|2|13|3|5)$", colnames(sbs))]), maxIter=10000)
e <- cbind(clonal=rowSums(e[,1:3]), subclonal=e[,4])
a <- rbind(`APOBEC`=colSums(e[c(2,5),]), e[-c(2,5),])
apply(t(t(a)/colSums(a))*100,2,mg14:::roundProp)
## clonal subclonal
## APOBEC 64 7
## SBS1 1 2
## SBS3 31 85
## SBS5 4 6
Now assess how the signature attribution changes in samples with measurable spectral shifts. These data were calculated by Clemency Jolly.
sfc <- read.table("../ref/2019-01-03-allSignatureChanges.txt", header=TRUE, sep="\t")
v <- sfc$samplename %in% names(which(p.adjust(pEarlyLate["p",])<0.05))
#mg14:::ggPlot(pmin(100,pmax(0.01,exp(sfc$log2fc_earlyLate[v]*log(2)))),sfc$signature[v], log='y')
#cairo_pdf("sigFcEarlyLate.pdf", 6, 3, pointsize=8)
.par()
par(mar=c(9,3,1,1), bty="n")
s <- names(sort(sapply(split(sfc$log2fc_earlyLate[v]*log(2)/log(10), sfc$signature[v]), median, na.rm=TRUE)))
s <- setdiff(s, c(names(which(table(sfc$signature[v])<10)),"n_unassigned"))
t <- donor2type[sample2donor[as.character(sfc$samplename)[v]]]
f <- function(x) pmax(-2,pmin(2,x*log(2)/log(10)))
x <- jitter(as.numeric(factor(as.character(sfc$signature[v]), levels=s)))
plot(f(sfc$log2fc_earlyLate[v]) ~ x, pch=NA, xaxt='n', xlab="", ylab="fold change", ylim=c(-2.2,2.2), xaxt="n", yaxt="n", lwd=0.5)
#segments(x, y0=f(sfc$lCI_earlyLate[v]), y1=f(sfc$uCI_earlyLate[v]), col='grey')
points(f(sfc$log2fc_earlyLate[v]) ~ x, bg=tissueColors[t], pch=21, col=tissueBorder[t], cex=tissueCex[t])
boxplot(sfc$log2fc_earlyLate[v]*log(2)/log(10) ~factor(as.character(sfc$signature[v]), levels=s), pch=NA, staplewex=0, lty=1, add=TRUE, col=NA, xaxt="n", yaxt="n")
mg14::rotatedLabel(x0=seq_along(s), labels=s)
axis(side=2, at=-2:2, labels=paste0(c("\u2265","","","","\u2264"),10^(-2:2)))
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <a5>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <a5>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <a4>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <a4>
m <- log10(2:9 %o% 10^(-2:1))
axis(side=2, at=m, tcl=-0.1, labels=rep("", length(m)))
axis(side=1, at=seq_along(s), labels=rep("", length(lengths(s))))
text(0,2.2, "late", pos=4)
text(0,-2.2, "early", pos=4)
abline(h=0, lty=3)
#dev.off()
xlsx output
Figure4c <- createSheet(Figure4, "Figure4c")
addDataFrame(data.frame(sfc[v,c("samplename","signature","log2fc_earlyLate")], tumour_type=t), Figure4c)
Some numbers
t(signif(2^sapply(split(sfc$log2fc_earlyLate[v] , sfc$signature[v])[s], quantile, na.rm=TRUE),2))
## 0% 25% 50% 75% 100%
## SBS7 0.0096 0.088 0.16 0.43 2.7
## SBS12 0.0017 0.058 0.22 0.41 1.6
## SBS4 0.0610 0.310 0.43 0.72 2.5
## SBS1 0.0100 0.430 0.73 1.60 54.0
## SBS5 0.0026 0.490 0.86 1.50 940.0
## SBS17 0.0082 0.250 0.93 3.50 510.0
## DBS8 0.6500 0.800 0.96 1.20 3.4
## DBS3 0.3700 0.810 0.98 1.30 1.5
## DBS2 0.0750 0.720 1.00 1.30 15.0
## ID1 0.0450 0.620 1.00 1.90 22.0
## ID13 0.3100 0.800 1.10 1.50 4.6
## DBS11 0.0780 0.780 1.10 1.70 9.1
## ID2 0.0610 0.700 1.10 2.00 22.0
## DBS7 0.4000 0.820 1.10 1.60 7.0
## SBS18 0.0074 0.770 1.10 2.10 5.8
## DBS6 0.3000 0.880 1.20 1.70 5.4
## DBS1 0.1800 0.790 1.20 1.90 3.7
## DBS9 0.2700 0.800 1.20 1.80 12.0
## DBS4 0.1600 0.820 1.20 1.70 11.0
## ID8 0.2100 0.850 1.20 2.10 32.0
## SBS37 0.6700 0.810 1.30 4.00 170.0
## SBS6.14.15.20.21.26.44 0.4300 1.000 1.30 1.80 4.5
## SBS40 0.1600 0.880 1.40 2.20 44.0
## SBS3 0.3500 0.960 1.50 2.00 26.0
## SBS29 0.4100 0.840 1.50 1.80 4.9
## DBS5 0.3900 0.890 1.50 3.70 33.0
## SBS41 0.0100 0.930 1.80 1.90 11.0
## SBS2.13 0.0190 0.820 2.00 3.60 86.0
## SBS38 0.6300 1.800 3.60 11.00 31.0
w <- sfc$samplename %in% names(which(p.adjust(pClonalSubclonal["p",])<0.05))
#cairo_pdf("sigFcClonalSubclonal.pdf", 6, 3, pointsize=8)
.par()
par(mar=c(9,3,1,1), bty="n")
s <- names(sort(sapply(split(sfc$log2fc_clonalSubclonal[w]*log(2)/log(10), sfc$signature[w]), median, na.rm=TRUE)))
s <- setdiff(s, c(names(which(table(sfc$signature[w])<10)),"n_unassigned"))
t <- donor2type[sample2donor[as.character(sfc$samplename)[w]]]
f <- function(x) pmax(-2,pmin(2,x*log(2)/log(10)))
x <- jitter(as.numeric(factor(as.character(sfc$signature[w]), levels=s)))
plot(f(sfc$log2fc_clonalSubclonal[w]) ~ x, pch=NA, xaxt='n', xlab="", ylab="fold change", ylim=c(-2.2,2.2), xaxt="n", yaxt="n", lwd=0.5)
#segments(x, y0=f(sfc$lCI_clonalSubclonal[v]), y1=f(sfc$uCI_clonalSubclonal[v]), col='grey')
points(f(sfc$log2fc_clonalSubclonal[w]) ~ x, bg=tissueColors[t], pch=21, col=tissueBorder[t], cex=tissueCex[t])
boxplot(f(sfc$log2fc_clonalSubclonal[w]) ~factor(as.character(sfc$signature[w]), levels=s), pch=NA, staplewex=0, lty=1, add=TRUE, col=NA, xaxt="n", yaxt="n")
mg14::rotatedLabel(x0=seq_along(s), labels=s)
axis(side=2, at=-2:2, labels=paste0(c("\u2265","","","","\u2264"),10^(-2:2)))
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <a5>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���0.01' in
## 'mbcsToSbcs': dot substituted for <a5>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <a4>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <89>
## Warning in axis(side = 2, at = -2:2, labels = paste0(c("<U+2265>", "", "", : conversion failure on '���100' in
## 'mbcsToSbcs': dot substituted for <a4>
m <- log10(2:9 %o% 10^(-2:1))
axis(side=2, at=m, tcl=-0.1, labels=rep("", length(m)))
axis(side=1, at=seq_along(s), labels=rep("", length(lengths(s))))
text(0,2.2, "subclonal", pos=4)
text(0,-2.2, "clonal", pos=4)
abline(h=0, lty=3)
#dev.off()
xlsx output
Figure4d <- createSheet(Figure4, "Figure4d")
addDataFrame(data.frame(sfc[w,c("samplename","signature","log2fc_clonalSubclonal")], tumour_type=t), Figure4d)
Some numbers
t(signif(2^sapply(split(sfc$log2fc_clonalSubclonal[w] , sfc$signature[w])[s], quantile, na.rm=TRUE),2))
## 0% 25% 50% 75% 100%
## SBS9 0.00083 0.019 0.053 0.43 2.20
## SBS7 0.00660 0.090 0.180 0.24 0.65
## SBS12 0.00190 0.090 0.270 0.44 1.60
## ID2 0.00069 0.200 0.480 1.30 120.00
## ID1 0.00830 0.220 0.550 1.50 240.00
## SBS17 0.00500 0.270 0.570 1.80 59.00
## SBS4 0.00110 0.410 0.580 0.95 4.90
## SBS1 0.00410 0.380 0.640 1.20 57.00
## SBS16 0.00190 0.013 0.730 1.10 8.80
## DBS8 0.34000 0.740 0.850 1.10 4.10
## SBS5 0.00200 0.400 0.860 1.50 370.00
## DBS10 0.73000 0.930 0.950 0.99 2.50
## SBS30 0.18000 0.510 0.960 2.80 380.00
## ID13 0.38000 0.760 0.990 1.50 12.00
## SBS3 0.03900 0.650 1.000 1.60 8.70
## DBS2 0.20000 0.730 1.000 1.60 8.70
## SBS41 0.00280 0.220 1.100 1.40 17.00
## SBS28 0.22000 0.660 1.100 1.50 4.60
## DBS4 0.20000 0.720 1.100 1.80 11.00
## DBS6 0.26000 0.780 1.100 1.80 9.10
## DBS11 0.25000 0.790 1.100 1.90 10.00
## SBS22 0.12000 0.500 1.200 2.10 18.00
## DBS9 0.19000 0.820 1.200 1.70 11.00
## DBS7 0.33000 0.930 1.200 2.00 9.20
## DBS3 0.15000 0.960 1.200 1.60 6.20
## SBS18 0.00470 0.740 1.200 2.00 15.00
## ID8 0.04700 0.630 1.300 2.50 16.00
## SBS40 0.07900 0.750 1.400 2.20 24.00
## SBS29 0.01300 1.000 1.400 2.40 7.50
## DBS5 0.28000 0.700 1.600 2.50 14.00
## SBS2.13 0.03400 0.790 1.600 2.60 170.00
## DBS1 0.36000 0.930 1.600 2.50 8.20
## SBS6.14.15.20.21.26.44 0.23000 1.200 1.800 3.00 72.00
## SBS37 0.21000 1.600 29.000 350.00 540.00
Legend for figure
#pdf("legend.pdf", 2,4)
.par()
par(mar=c(0,0,0,0), bty="n")
plot(NA,NA, xlim=c(-1,1), ylim=c(-1,1), xaxt="n",yaxt="n", xlab='', ylab="")
l <- as.character(sort(unique(donor2type[sample2donor[as.character(sfc$samplename)[w | v]]])))
n <- unique(c(names(which(p.adjust(pClonalSubclonal["p",])<0.05)),names(which(p.adjust(pEarlyLate["p",])<0.05))))
m <- unique(c(names(pClonalSubclonal["p",]),names(pEarlyLate["p",])))
s <- table(donor2type[sample2donor[n]])[l]
t <- table(donor2type[sample2donor[m]])[l]
#l <- l[order(-s/t)]
legend("top", legend=paste0(l, " (",s[l],"/",t[l],")"), pt.bg=tissueColors[l], pch=21, col=tissueBorder[l], pt.cex=tissueCex[l], bty="n")
dev.off()
## null device
## 1
Save output underlying figures.
saveWorkbook(Figure1,'Figure1.xlsx')
saveWorkbook(Figure2,'Figure2.xlsx')
saveWorkbook(Figure4,'Figure4.xlsx')
saveWorkbook(Figure5,'Figure5.xlsx')
saveWorkbook(ExtendedDataFigure3,'ExtendedDataFigure3.xlsx')
saveWorkbook(ExtendedDataFigure6,'ExtendedDataFigure6.xlsx')
saveWorkbook(ExtendedDataFigure8,'ExtendedDataFigure8.xlsx')
saveWorkbook(ExtendedDataFigure9,'ExtendedDataFigure9.xlsx')
Other working group output, to be released on synapse https://www.synapse.org/#!Synapse:syn14193595. These data may be more useful than the specific subsets underlying each published figure.
n <- names(finalSnv)
wgdMrcaTimingData <- data.frame(
uuid=n,
icgc_sample_id=sample2icgc[n],
icgc_donor_id=sample2donor[n],
tissue=donor2type[sample2donor[n]],
WGD=isWgd,
ploidy=finalPloidy,
eff_ploidy=effGenome,
purity=finalPurity,
age=round(age[sample2donor[n]]),
n_snv_mnv=sapply(finalSnv, nrow),
CpG_TpG_trunk_pwradj=round(branchDeam[,2],2),
CpG_TpG_subclonal_branch_pwradj=round(branchDeam[,1],2),
CpG_TpG_subclonal_linear_pwradj=round(branchDeamLinear[,1],2),
TiN = TiN[sample2donor[n]],
remove = n %in% remove,
accel = ifelse(n %in% remove,NA,guessAccel[as.character(donor2type[sample2donor[n]])]),
row.names=n)
t <- as.data.frame(round(Reduce("rbind", wgdTimeAbsType),2))
colnames(t) <- c("time", "time.lo","time.up")
w <- data.frame(row.names=rownames(t), t, `CpG_TpG_total`=nDeam[rownames(t)])
colnames(w) <- sub("lo", "10%", sub("up","90%", colnames(w)))
t <- cbind(branching=as.data.frame(round(Reduce("rbind", subclonesTimeAbsType),2)), linear=as.data.frame(round(Reduce("rbind", subclonesTimeAbsTypeLinear),2)))
colnames(t) <- sub("\\.hat","", colnames(t))
wgdMrcaTimingData <- cbind(wgdMrcaTimingData,
WGD=w[rownames(wgdMrcaTimingData),],
MRCA.time=t[rownames(wgdMrcaTimingData),]
)
write.table(wgdMrcaTimingData, file=paste0(Sys.Date(),"-wgdMrcaTiming.txt"), sep="\t", quote=FALSE, row.names=FALSE)
t <- do.call("rbind", mclapply(finalBB, function(bb) as.data.frame(bb[,c(1:6,38:47)])))
n <- rownames(t)
t <- as.data.frame(lapply(t, function(x) if(class(x)=="numeric") round(x,3) else x))
t$sample <- sub("\\..+","",n)
write.table(t, file=paste0(Sys.Date(),"-allSegmentsTimeRaw.txt"), quote=FALSE, sep="\t")
t <- as.data.frame(finalDriversAnnotated)[c(1,2,6,7,9,10,11,12,13,14,31:44)]
t <- as.data.frame(lapply(t, function(x) if(class(x)=="numeric") round(x,3) else x));
write.table(t, file=paste0(Sys.Date(),"-driversTiming.txt"), quote=FALSE, sep="\t")
cat(system("git log -n1", intern=TRUE), sep="\n")
## commit d395e6ed6b447f40b49fc09a5e934ea1d7382760
## Author: Moritz Gerstung <moritz.gerstung@ebi.ac.uk>
## Date: Fri Aug 23 09:41:34 2019 +0100
##
## Fixed header
l <- ls()
data.frame(variable=l, Reduce("rbind",lapply(l, function(x) data.frame(class=class(get(x)), size=format(object.size(get(x)), units="auto")))))
## variable class size
## 1 CANCERGENES character 52.4 Kb
## 2 ExtendedDataFigure3 jobjRef 944 bytes
## 3 ExtendedDataFigure3b jobjRef 944 bytes
## 4 ExtendedDataFigure3d jobjRef 944 bytes
## 5 ExtendedDataFigure6 jobjRef 944 bytes
## 6 ExtendedDataFigure6a jobjRef 944 bytes
## 7 ExtendedDataFigure6b jobjRef 944 bytes
## 8 ExtendedDataFigure8 jobjRef 944 bytes
## 9 ExtendedDataFigure8a jobjRef 944 bytes
## 10 ExtendedDataFigure8b jobjRef 944 bytes
## 11 ExtendedDataFigure8c jobjRef 944 bytes
## 12 ExtendedDataFigure8d jobjRef 944 bytes
## 13 ExtendedDataFigure8e jobjRef 944 bytes
## 14 ExtendedDataFigure8f jobjRef 944 bytes
## 15 ExtendedDataFigure8g jobjRef 944 bytes
## 16 ExtendedDataFigure8h jobjRef 944 bytes
## 17 ExtendedDataFigure9 jobjRef 944 bytes
## 18 ExtendedDataFigure9a jobjRef 944 bytes
## 19 ExtendedDataFigure9b jobjRef 944 bytes
## 20 ExtendedDataFigure9c jobjRef 944 bytes
## 21 ExtendedDataFigure9d jobjRef 944 bytes
## 22 ExtendedDataFigure9e jobjRef 944 bytes
## 23 ExtendedDataFigure9f jobjRef 944 bytes
## 24 ExtendedDataFigure9g jobjRef 944 bytes
## 25 Figure1 jobjRef 944 bytes
## 26 Figure1f jobjRef 944 bytes
## 27 Figure1g jobjRef 944 bytes
## 28 Figure1h jobjRef 944 bytes
## 29 Figure2 jobjRef 944 bytes
## 30 Figure2a jobjRef 944 bytes
## 31 Figure2a2 jobjRef 944 bytes
## 32 Figure2a3 jobjRef 944 bytes
## 33 Figure2d jobjRef 944 bytes
## 34 Figure4 jobjRef 944 bytes
## 35 Figure4a jobjRef 944 bytes
## 36 Figure4b jobjRef 944 bytes
## 37 Figure4c jobjRef 944 bytes
## 38 Figure4d jobjRef 944 bytes
## 39 Figure5 jobjRef 944 bytes
## 40 Figure5b jobjRef 944 bytes
## 41 Figure5c jobjRef 944 bytes
## 42 I matrix 1.2 Mb
## 43 J matrix 1.3 Mb
## 44 T.i.F matrix 294.3 Kb
## 45 TiN numeric 202.2 Kb
## 46 a matrix 944 bytes
## 47 ab array 9.1 Kb
## 48 accel numeric 640 bytes
## 49 accelRelWgd list 83 Kb
## 50 addAssignment function 12.1 Kb
## 51 addDriver function 27.1 Kb
## 52 addFinalDriver function 16.7 Kb
## 53 addMutCn function 12.5 Kb
## 54 addTNC function 18.8 Kb
## 55 age numeric 202.2 Kb
## 56 aggregatePerChromosome function 32.4 Kb
## 57 allChrAgg array 2.8 Mb
## 58 allChrCancerHist array 79 Kb
## 59 allGender data.frame 1.3 Mb
## 60 applyPigeonHole function 16.8 Kb
## 61 asum function 18.4 Kb
## 62 at function 59.7 Kb
## 63 averageEvenPloidy function 8.8 Kb
## 64 averageHom function 24.2 Kb
## 65 averagePloidy function 28.9 Kb
## 66 avgWeights function 11.1 Kb
## 67 b matrix 984 bytes
## 68 b_mrca numeric 288 bytes
## 69 b_primary numeric 288 bytes
## 70 b_relapse numeric 288 bytes
## 71 basePath character 168 bytes
## 72 bb GRanges 12.6 Kb
## 73 bbFiles character 48 bytes
## 74 bbPath character 232 bytes
## 75 bbToTime function 72.8 Kb
## 76 betaFromCi function 123.9 Kb
## 77 branchDeam matrix 326 Kb
## 78 branchDeamLinear matrix 326 Kb
## 79 branchLengths function 19 Kb
## 80 bwd numeric 56 bytes
## 81 c character 5.3 Kb
## 82 cc list 43.9 Kb
## 83 chrOffset numeric 6.8 Kb
## 84 ci matrix 1 Kb
## 85 classWgd function 1.7 Kb
## 86 classifyMutations function 58.8 Kb
## 87 clinicalData data.frame 949.7 Kb
## 88 clusters data.frame 1 Kb
## 89 clustersFromBB function 12.8 Kb
## 90 cn data.frame 132.3 Mb
## 91 cnPath character 232 bytes
## 92 cnStates list 2.9 Kb
## 93 cnWeights function 8.9 Kb
## 94 col character 736 bytes
## 95 colTime character 1 Kb
## 96 colon_sbs data.frame 105.1 Kb
## 97 computeMutCn function 330.1 Kb
## 98 computeSubclonesTimeAbs function 45.7 Kb
## 99 computeWgdParamDeam function 41.1 Kb
## 100 consensusClustersToOld function 5.6 Kb
## 101 cosineDist function 24.6 Kb
## 102 cpgtpg logical 511.4 Kb
## 103 ct integer 250.8 Kb
## 104 d matrix 88 Kb
## 105 d0 numeric 3.2 Kb
## 106 d1 numeric 3.2 Kb
## 107 d50 integer 64 bytes
## 108 dClonalSubclonal numeric 70.9 Kb
## 109 dEarlyLate numeric 64.8 Kb
## 110 data list 1.4 Mb
## 111 dbetabinom function 6.6 Kb
## 112 deamRateWgd list 71.5 Kb
## 113 defineMcnStates function 1.3 Mb
## 114 df data.frame 243.6 Kb
## 115 distAsRange function 17.2 Kb
## 116 donor2sample list 26.1 Kb
## 117 donor2type factor 211 Kb
## 118 doubleGains data.frame 295.4 Kb
## 119 doubleGainsAggregated data.frame 41.8 Kb
## 120 dpFiles character 325.6 Kb
## 121 dpPath character 168 bytes
## 122 dtrbetabinom function 9.9 Kb
## 123 dtrbinom function 3.1 Kb
## 124 dumpfile character 136 bytes
## 125 e matrix 1.1 Kb
## 126 effGenome numeric 304 Kb
## 127 f function 15.1 Kb
## 128 finalBB list 3.8 Gb
## 129 finalBBGray list 73.5 Mb
## 130 finalClusters list 3 Mb
## 131 finalClustersGray list 84.7 Kb
## 132 finalData data.frame 11 Mb
## 133 finalDrivers VRanges 1.5 Mb
## 134 finalDriversAnnotated VRanges 2.2 Mb
## 135 finalGenotypes array 156.2 Mb
## 136 finalGenotypesP array 104.3 Mb
## 137 finalGenotypesQ array 21.1 Mb
## 138 finalHom numeric 304 Kb
## 139 finalIndel list 2 Gb
## 140 finalIndelGray list 673.6 Mb
## 141 finalPloidy numeric 304 Kb
## 142 finalPower list 622.3 Kb
## 143 finalPurity numeric 304 Kb
## 144 finalPurityGray numeric 8.4 Kb
## 145 finalSnv list 21.8 Gb
## 146 finalSnvGray list 1.7 Gb
## 147 finalSv list 3.2 Gb
## 148 findMainCluster function 117.5 Kb
## 149 first_time logical 56 bytes
## 150 fit stanfit 63 Mb
## 151 flattenBB function 23.5 Kb
## 152 fm numeric 2.7 Kb
## 153 fmq matrix 3.4 Kb
## 154 foo data.frame 3.1 Mb
## 155 fracGenomeWgdComp matrix 500.3 Kb
## 156 fractionGenomeWgdCompatible function 185.5 Kb
## 157 g matrix 63.1 Kb
## 158 getAltCount function 168.4 Kb
## 159 getGenotype function 18.6 Kb
## 160 getMutationCluster function 7.2 Kb
## 161 getTumorCounts function 7.4 Kb
## 162 getTumorDepth function 81.1 Kb
## 163 grayList logical 10.9 Kb
## 164 gt matrix 205.2 Kb
## 165 gtu matrix 151 Kb
## 166 gu matrix 37.1 Kb
## 167 guessAccel character 3.1 Kb
## 168 h histogram 2.3 Kb
## 169 hh matrix 3.9 Kb
## 170 hi integer 64 bytes
## 171 histBeta function 166.9 Kb
## 172 hsc_data data.frame 72.4 Mb
## 173 i integer 56 bytes
## 174 isDeamination function 2.8 Kb
## 175 isDeaminationNoUV function 2.8 Kb
## 176 isWgd logical 293.2 Kb
## 177 is_cpgtpg logical 288.7 Kb
## 178 j numeric 544 bytes
## 179 l character 23.9 Kb
## 180 lineCol character 5.2 Kb
## 181 lo integer 64 bytes
## 182 loadAssignment function 6.9 Kb
## 183 loadBB function 8.5 Kb
## 184 loadClusters function 9.2 Kb
## 185 loadCn function 11.4 Kb
## 186 loadConsensusCNA function 55.7 Kb
## 187 loadConsensusClusters function 9.2 Kb
## 188 loadPositions function 9.5 Kb
## 189 loadVcf function 27.6 Kb
## 190 m character 273 Kb
## 191 m0 numeric 81.9 Kb
## 192 ma numeric 3.3 Kb
## 193 madClonalSubclonal numeric 77.9 Kb
## 194 madEarlyLate numeric 57.6 Kb
## 195 makeTitle function 19.8 Kb
## 196 matchDrivers function 16.5 Kb
## 197 mcnHeader function 13.3 Kb
## 198 mergeClusters function 23.4 Kb
## 199 mergeClustersByMutreadDiff function 71.1 Kb
## 200 model_code character 152 bytes
## 201 my_png function 26.9 Kb
## 202 n character 282.2 Kb
## 203 nClones integer 293.2 Kb
## 204 nDeam integer 79 Kb
## 205 nDeam22 numeric 81.9 Kb
## 206 nSub integer 293.2 Kb
## 207 nSubclones integer 2.8 Kb
## 208 nWgd integer 2.2 Kb
## 209 n_cpgtpg numeric 10.1 Kb
## 210 na.rm function 1.7 Kb
## 211 nmSolve function 136.8 Kb
## 212 normal_colon_cpgtpg numeric 440 bytes
## 213 normal_cpgtpg matrix 1.1 Kb
## 214 normal_endometrium_cpgtpg numeric 440 bytes
## 215 normal_hsc_cpgtpg numeric 440 bytes
## 216 normal_skin_cpgtpg numeric 56 bytes
## 217 o integer 176 bytes
## 218 p list 164.4 Kb
## 219 pClonalSubclonal matrix 299.3 Kb
## 220 pEarlyLate matrix 232.4 Kb
## 221 pGainToTime function 26.3 Kb
## 222 parseRegion function 10.8 Kb
## 223 pbetabinom function 6.6 Kb
## 224 pi matrix 248 bytes
## 225 piToTime function 97.1 Kb
## 226 plotBB function 553 Kb
## 227 plotPoset function 9.2 Kb
## 228 plotSample function 609.1 Kb
## 229 plotSv function 354.9 Kb
## 230 plotTiming function 517.8 Kb
## 231 plotVcf function 342 Kb
## 232 posetDist function 14.9 Kb
## 233 posteriorMutCN function 18.1 Kb
## 234 power function 6.7 Kb
## 235 predictRealTime function 8.4 Kb
## 236 prgn character 792 bytes
## 237 probGenotype function 17.8 Kb
## 238 probGenotypeTail function 16.6 Kb
## 239 ptrbetabinom function 8.8 Kb
## 240 pu matrix 44.9 Kb
## 241 purity numeric 56 bytes
## 242 purityPloidy data.frame 370.7 Kb
## 243 q matrix 4.6 Kb
## 244 q1 numeric 304 Kb
## 245 q5 numeric 304 Kb
## 246 qAccelRelWgd matrix 3.7 Kb
## 247 qPanCan numeric 584 bytes
## 248 qRateDeam matrix 4.6 Kb
## 249 qSubclone array 11.6 Kb
## 250 qSubcloneLinear array 11.6 Kb
## 251 qWgd array 9.2 Kb
## 252 r function 1.6 Kb
## 253 rateDeam list 311.4 Kb
## 254 reduceBB function 30.1 Kb
## 255 reduceToCoverRelations function 43 Kb
## 256 refFile character 136 bytes
## 257 refLengths GRanges 23.1 Kb
## 258 relapse data.frame 8.8 Kb
## 259 remove character 75.5 Kb
## 260 removeSuperclones function 24.2 Kb
## 261 rho numeric 56 bytes
## 262 s table 3.4 Kb
## 263 sample2donor character 481.1 Kb
## 264 sample2icgc character 506.8 Kb
## 265 sbs data.frame 61.1 Kb
## 266 selectedSamples logical 10.9 Kb
## 267 set1 character 680 bytes
## 268 sfc data.frame 2.8 Mb
## 269 sigCol matrix 1.9 Kb
## 270 simulateMutations function 57.6 Kb
## 271 specimenData data.frame 2.6 Mb
## 272 stackTime function 31 Kb
## 273 subclonesTimeAbs list 708.4 Kb
## 274 subclonesTimeAbsLinear list 708.4 Kb
## 275 subclonesTimeAbsType list 277.5 Kb
## 276 subclonesTimeAbsTypeLinear list 277.5 Kb
## 277 t data.frame 1.9 Mb
## 278 t0 numeric 81.9 Kb
## 279 t2 numeric 56 bytes
## 280 t3 numeric 56 bytes
## 281 t_mrca numeric 288 bytes
## 282 t_primary numeric 288 bytes
## 283 t_relapse numeric 288 bytes
## 284 tab data.frame 4.4 Kb
## 285 testDriver function 5.4 Kb
## 286 testIndel function 5.4 Kb
## 287 timeToBeta function 17.9 Kb
## 288 timingClass factor 294 Kb
## 289 timingInfo data.frame 578.4 Kb
## 290 timing_data matrix 3.3 Mb
## 291 tissueBorder character 3.4 Kb
## 292 tissueCex numeric 3.3 Kb
## 293 tissueColors character 5.2 Kb
## 294 tissueLines character 5.2 Kb
## 295 tissueLty numeric 3.3 Kb
## 296 tnc DNAStringSet 2.9 Mb
## 297 tncLrt function 80.8 Kb
## 298 tncTime array 4.4 Mb
## 299 tncToPyrimidine function 25.5 Kb
## 300 toGraph function 11.4 Kb
## 301 tpy numeric 56 bytes
## 302 tryExceptNull function 2.1 Kb
## 303 tt character 344 bytes
## 304 tt0 matrix 326 Kb
## 305 ttl function 38.5 Kb
## 306 typeNa character 496 bytes
## 307 typesSubclones character 2.5 Kb
## 308 u character 204.3 Kb
## 309 uniqueSamples logical 10.9 Kb
## 310 v logical 99.3 Kb
## 311 vcf CollapsedVCF 52.8 Mb
## 312 vcfIndel CollapsedVCF 45.5 Mb
## 313 vcfPath character 152 bytes
## 314 void logical 86.5 Kb
## 315 vv table 2.9 Kb
## 316 w data.frame 75.8 Kb
## 317 w22 numeric 81.9 Kb
## 318 wccClusters list 3 Mb
## 319 wccPurity numeric 21.8 Kb
## 320 wccTable data.frame 592.8 Kb
## 321 wemeClustersMerged list 3.4 Mb
## 322 wgdCancerHist matrix 6.1 Kb
## 323 wgdMrcaTimingData data.frame 1.4 Mb
## 324 wgdParamDeam list 50.8 Mb
## 325 wgdPoss logical 293.2 Kb
## 326 wgdStar factor 293.8 Kb
## 327 wgdStat factor 293.8 Kb
## 328 wgdTest function 14.9 Kb
## 329 wgdTimeAbs list 197.5 Kb
## 330 wgdTimeAbsType list 92.6 Kb
## 331 wgdTimeDeamAcc array 12.4 Mb
## 332 whiteList logical 10.9 Kb
## 333 wnmSolve function 41.6 Kb
## 334 wu logical 30.2 Kb
## 335 ww integer 56 bytes
## 336 x character 112 bytes
## 337 x0 numeric 280 bytes
## 338 xx numeric 1.8 Kb
## 339 y numeric 2.3 Kb
## 340 y0 numeric 2.3 Kb
## 341 y1 numeric 2.3 Kb
## 342 yy numeric 1.8 Kb
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
##
## Matrix products: default
## BLAS: /usr/lib64/R/lib/libRblas.so
## LAPACK: /usr/lib64/R/lib/libRlapack.so
##
## locale:
## [1] C
##
## attached base packages:
## [1] splines stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rstan_2.17.2 StanHeaders_2.17.2 ggplot2_3.2.1 xlsx_0.6.1
## [5] VGAM_1.1-1 VariantAnnotation_1.20.3 SummarizedExperiment_1.4.0 Biobase_2.34.0
## [9] Rsamtools_1.26.2 Biostrings_2.42.1 XVector_0.14.1 GenomicRanges_1.26.4
## [13] GenomeInfoDb_1.10.3 IRanges_2.8.2 S4Vectors_0.12.2 BiocGenerics_0.20.0
## [17] knitr_1.24
##
## loaded via a namespace (and not attached):
## [1] fs_1.3.1 bitops_1.0-6 usethis_1.5.1 devtools_2.1.0
## [5] bit64_0.9-7 RColorBrewer_1.1-2 rprojroot_1.3-2 tools_3.5.1
## [9] backports_1.1.4 R6_2.4.0 DBI_1.0.0 lazyeval_0.2.2
## [13] colorspace_1.4-1 withr_2.1.2 gridExtra_2.3 prettyunits_1.0.2
## [17] processx_3.4.1 bit_1.1-14 curl_4.0 compiler_3.5.1
## [21] mg14_0.0.5 cli_1.1.0 desc_1.2.0 rtracklayer_1.34.2
## [25] scales_1.0.0 nnls_1.4 mvtnorm_1.0-11 callr_3.3.1
## [29] stringr_1.4.0 digest_0.6.20 foreign_0.8-70 rmarkdown_1.14
## [33] rio_0.5.16 pkgconfig_2.0.2 htmltools_0.3.6 sessioninfo_1.1.1
## [37] BSgenome_1.42.0 highr_0.8 rlang_0.4.0 readxl_1.3.1
## [41] RSQLite_2.1.2 BiocParallel_1.8.2 zip_2.0.3 car_3.0-3
## [45] inline_0.3.15 RCurl_1.95-4.12 magrittr_1.5 Matrix_1.2-14
## [49] Rcpp_1.0.2 munsell_0.5.0 abind_1.4-5 stringi_1.4.3
## [53] yaml_2.2.0 carData_3.0-2 zlibbioc_1.20.0 pkgbuild_1.0.4
## [57] grid_3.5.1 blob_1.2.0 forcats_0.4.0 crayon_1.3.4
## [61] lattice_0.20-35 haven_2.1.1 GenomicFeatures_1.26.4 xlsxjars_0.6.1
## [65] hms_0.5.0 zeallot_0.1.0 ps_1.3.0 pillar_1.4.2
## [69] codetools_0.2-15 biomaRt_2.30.0 pkgload_1.0.2 glue_1.3.1
## [73] XML_3.98-1.20 evaluate_0.14 remotes_2.1.0 data.table_1.12.2
## [77] vctrs_0.2.0 testthat_2.2.1 cellranger_1.1.0 gtable_0.3.0
## [81] assertthat_0.2.1 xfun_0.8 openxlsx_4.1.0.1 tibble_2.1.3
## [85] rJava_0.9-11 GenomicAlignments_1.10.1 AnnotationDbi_1.36.2 beeswarm_0.2.3
## [89] memoise_1.1.0
devtools::session_info()
## - Session info -------------------------------------------------------------------------------------------------------
## setting value
## version R version 3.5.1 (2018-07-02)
## os Red Hat Enterprise Linux
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate C
## ctype C
## tz Europe/London
## date 2019-08-23
##
## - Packages -----------------------------------------------------------------------------------------------------------
## package * version date lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 3.5.1)
## AnnotationDbi 1.36.2 2019-08-15 [1] Bioconductor
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.5.1)
## backports 1.1.4 2019-04-10 [1] CRAN (R 3.5.1)
## beeswarm 0.2.3 2016-04-25 [1] CRAN (R 3.5.1)
## Biobase * 2.34.0 2019-08-15 [1] Bioconductor
## BiocGenerics * 0.20.0 2019-08-15 [1] Bioconductor
## BiocParallel 1.8.2 2019-08-15 [1] Bioconductor
## biomaRt 2.30.0 2019-08-15 [1] Bioconductor
## Biostrings * 2.42.1 2019-08-15 [1] Bioconductor
## bit 1.1-14 2018-05-29 [1] CRAN (R 3.5.1)
## bit64 0.9-7 2017-05-08 [1] CRAN (R 3.5.1)
## bitops 1.0-6 2013-08-17 [1] CRAN (R 3.5.1)
## blob 1.2.0 2019-07-09 [1] CRAN (R 3.5.1)
## BSgenome 1.42.0 2019-08-15 [1] Bioconductor
## callr 3.3.1 2019-07-18 [1] CRAN (R 3.5.1)
## car 3.0-3 2019-05-27 [1] CRAN (R 3.5.1)
## carData 3.0-2 2018-09-30 [1] CRAN (R 3.5.1)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.5.1)
## cli 1.1.0 2019-03-19 [1] CRAN (R 3.5.1)
## codetools 0.2-15 2016-10-05 [2] CRAN (R 3.5.1)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.5.1)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.5.1)
## curl 4.0 2019-07-22 [1] CRAN (R 3.5.1)
## data.table 1.12.2 2019-04-07 [1] CRAN (R 3.5.1)
## DBI 1.0.0 2018-05-02 [1] CRAN (R 3.5.1)
## desc 1.2.0 2018-05-01 [1] CRAN (R 3.5.1)
## devtools 2.1.0 2019-07-06 [1] CRAN (R 3.5.1)
## digest 0.6.20 2019-07-04 [1] CRAN (R 3.5.1)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.5.1)
## forcats 0.4.0 2019-02-17 [1] CRAN (R 3.5.1)
## foreign 0.8-70 2017-11-28 [2] CRAN (R 3.5.1)
## fs 1.3.1 2019-05-06 [1] CRAN (R 3.5.1)
## GenomeInfoDb * 1.10.3 2019-08-15 [1] Bioconductor
## GenomicAlignments 1.10.1 2019-08-15 [1] Bioconductor
## GenomicFeatures 1.26.4 2019-08-15 [1] Bioconductor
## GenomicRanges * 1.26.4 2019-08-15 [1] Bioconductor
## ggplot2 * 3.2.1 2019-08-10 [1] CRAN (R 3.5.1)
## glue 1.3.1 2019-03-12 [1] CRAN (R 3.5.1)
## gridExtra 2.3 2017-09-09 [1] CRAN (R 3.5.1)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.5.1)
## haven 2.1.1 2019-07-04 [1] CRAN (R 3.5.1)
## highr 0.8 2019-03-20 [1] CRAN (R 3.5.1)
## hms 0.5.0 2019-07-09 [1] CRAN (R 3.5.1)
## htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.5.1)
## inline 0.3.15 2018-05-18 [1] CRAN (R 3.5.1)
## IRanges * 2.8.2 2019-08-15 [1] Bioconductor
## knitr * 1.24 2019-08-08 [1] CRAN (R 3.5.1)
## lattice 0.20-35 2017-03-25 [2] CRAN (R 3.5.1)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 3.5.1)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.5.1)
## Matrix 1.2-14 2018-04-13 [2] CRAN (R 3.5.1)
## memoise 1.1.0 2017-04-21 [1] CRAN (R 3.5.1)
## mg14 0.0.5 2019-08-15 [1] Github (mg14/mg14@6a63283)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.5.1)
## mvtnorm 1.0-11 2019-06-19 [1] CRAN (R 3.5.1)
## nnls 1.4 2012-03-19 [1] CRAN (R 3.5.1)
## openxlsx 4.1.0.1 2019-05-28 [1] CRAN (R 3.5.1)
## pillar 1.4.2 2019-06-29 [1] CRAN (R 3.5.1)
## pkgbuild 1.0.4 2019-08-05 [1] CRAN (R 3.5.1)
## pkgconfig 2.0.2 2018-08-16 [1] CRAN (R 3.5.1)
## pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.5.1)
## prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.5.1)
## processx 3.4.1 2019-07-18 [1] CRAN (R 3.5.1)
## ps 1.3.0 2018-12-21 [1] CRAN (R 3.5.1)
## R6 2.4.0 2019-02-14 [1] CRAN (R 3.5.1)
## RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 3.5.1)
## Rcpp 1.0.2 2019-07-25 [1] CRAN (R 3.5.1)
## RCurl 1.95-4.12 2019-03-04 [1] CRAN (R 3.5.1)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.5.1)
## remotes 2.1.0 2019-06-24 [1] CRAN (R 3.5.1)
## rio 0.5.16 2018-11-26 [1] CRAN (R 3.5.1)
## rJava 0.9-11 2019-03-29 [1] CRAN (R 3.5.1)
## rlang 0.4.0 2019-06-25 [1] CRAN (R 3.5.1)
## rmarkdown 1.14 2019-07-12 [1] CRAN (R 3.5.1)
## rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.5.1)
## Rsamtools * 1.26.2 2019-08-15 [1] Bioconductor
## RSQLite 2.1.2 2019-07-24 [1] CRAN (R 3.5.1)
## rstan * 2.17.2 2017-12-21 [1] CRAN (R 3.5.1)
## rtracklayer 1.34.2 2019-08-15 [1] Bioconductor
## S4Vectors * 0.12.2 2019-08-15 [1] Bioconductor
## scales 1.0.0 2018-08-09 [1] CRAN (R 3.5.1)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.5.1)
## StanHeaders * 2.17.2 2018-01-20 [1] CRAN (R 3.5.1)
## stringi 1.4.3 2019-03-12 [1] CRAN (R 3.5.1)
## stringr 1.4.0 2019-02-10 [1] CRAN (R 3.5.1)
## SummarizedExperiment * 1.4.0 2019-08-15 [1] Bioconductor
## testthat 2.2.1 2019-07-25 [1] CRAN (R 3.5.1)
## tibble 2.1.3 2019-06-06 [1] CRAN (R 3.5.1)
## usethis 1.5.1 2019-07-04 [1] CRAN (R 3.5.1)
## VariantAnnotation * 1.20.3 2019-08-15 [1] Bioconductor
## vctrs 0.2.0 2019-07-05 [1] CRAN (R 3.5.1)
## VGAM * 1.1-1 2019-02-18 [1] CRAN (R 3.5.1)
## withr 2.1.2 2018-03-15 [1] CRAN (R 3.5.1)
## xfun 0.8 2019-06-25 [1] CRAN (R 3.5.1)
## xlsx * 0.6.1 2018-06-11 [1] CRAN (R 3.5.1)
## xlsxjars 0.6.1 2014-08-22 [1] CRAN (R 3.5.1)
## XML 3.98-1.20 2019-06-06 [1] CRAN (R 3.5.1)
## XVector * 0.14.1 2019-08-15 [1] Bioconductor
## yaml 2.2.0 2018-07-25 [1] CRAN (R 3.5.1)
## zeallot 0.1.0 2018-01-28 [1] CRAN (R 3.5.1)
## zip 2.0.3 2019-07-03 [1] CRAN (R 3.5.1)
## zlibbioc 1.20.0 2019-08-15 [1] Bioconductor
##
## [1] /homes/mg14/R/x86_64-redhat-linux-gnu-library/3.5
## [2] /usr/lib64/R/library
## [3] /usr/share/R/library
All code is available at github.com/gerstung-lab/PCAWG-11
See https://github.com/gerstung-lab/MutationTime.R
#' # Calculation of mutation copy numbers and timing parameters
#' Minimal example
#' source("/Users/mg14/Projects/PCAWG-11/code/MutationTime.R")
#' vcf <- readVcf("final/final_consensus_12oct_passonly/snv_mnv/0040b1b6-b07a-4b6e-90ef-133523eaf412.consensus.20160830.somatic.snv_mnv.vcf.gz",genome="GRCh37")
#' bb <- loadBB("dp/20161213_vanloo_wedge_consSNV_prelimConsCNAallStar/4_copynumber/0040b1b6-b07a-4b6e-90ef-133523eaf412_segments.txt.gz")
#' clusters = read.table("dp/20161213_vanloo_wedge_consSNV_prelimConsCNAallStar/2_subclones/0040b1b6-b07a-4b6e-90ef-133523eaf412_subclonal_structure.txt.gz", header=TRUE, sep="\t")
#' purityPloidy <- read.table("dp/20161213_vanloo_wedge_consSNV_prelimConsCNAallStar/1_purity_ploidy/purity_ploidy.txt, header=TRUE, sep="\t")
#' purity <- purityPloidy[2,2]
#' MCN <- computeMutCn(vcf[1:1000], bb, clusters, purity)
#' # Check output
#' # Mutation Annotation
#' head(MCN$D)
#' # Classify as basic clonal states
#' table(classifyMutations(MCN$D))
#' # Timing parameters
#' MCN$P[[1]]
#' # Extract timing of segments
#' bb$timing_param <- MCN$P
#' bbToTime(bb)
#' PRE: Purity must be equal to the main clone, otherwise spurious results can be generated
#' PRE: CNV segments don't have to overlap, otherwise an error is generated: "Error in if (!mixFlag[j]) { : missing value where TRUE/FALSE needed"
require(VariantAnnotation)
require(VGAM)
#' Extract tumour depth
#' @param vcf
#' @return
#'
#' @author mg14
#' @export
getTumorDepth <- function(vcf){
if("t_alt_count" %in% colnames(info(vcf))){ ## consensus data, snv and indel
info(vcf)$t_alt_count + info(vcf)$t_ref_count
}else{ ## older data
if("FAZ" %in% rownames(geno(header(vcf)))){
rowSums(getTumorCounts(vcf))
}else{
geno(vcf)$DEP[,2]
}
}
}
#' Get alt alleles counts
#' @param vcf
#' @return
#'
#' @author mg14
#' @export
getAltCount <- function(vcf){
if("t_alt_count" %in% colnames(info(vcf))){ ## consensus data, snv and indel
info(vcf)$t_alt_count
}else{ ## older formats
if("FAZ" %in% rownames(geno(header(vcf)))){ ## ie subs
c <- getTumorCounts(vcf)
t <- c[,1:4] + c[,5:8]
colnames(t) <- substring(colnames(t),2,2)
a <- as.character(unlist(alt(vcf)))
a[!a%in%c('A','T','C','G')] <- NA
sapply(seq_along(a), function(i) if(is.na(a[i])) NA else t[i, a[i]])
}
else{ ## ie indel
#(geno(vcf)$PP + geno(vcf)$NP + geno(vcf)$PB + geno(vcf)$NB)[,"TUMOUR"]
geno(vcf)$MTR[,2]
}
}
}
dtrbinom <- function(x, size, prob, xmin=0) dbinom(x,size,prob) / pbinom(xmin-1, size, prob, lower.tail=FALSE)
pbetabinom <- function(x, size, prob, rho){
if(rho!=0)
VGAM::pbetabinom(x, size, prob, rho)
else
pbinom(x, size, prob)
}
dbetabinom <- function(x, size, prob, rho){
if(rho!=0)
VGAM::dbetabinom(x, size, prob, rho)
else
dbinom(x, size, prob)
}
dtrbetabinom <- function(x, size, prob, rho, xmin=0) {y <- dbetabinom(x,size,prob,rho) / (1-pbetabinom(xmin-1, size, prob, rho))
y[x<xmin] <- 0
return(y)}
ptrbetabinom <- function(x, size, prob, rho, xmin=0) {
pmin <- pbetabinom(xmin-1, size, prob, rho)
pmax(0,(pbetabinom(x,size,prob,rho) - pmin) / (1-pmin))}
mergeClusters <- function(clusters, deltaFreq=0.05){
if(nrow(clusters) <= 1) return(clusters)
h <- hclust(dist(clusters$proportion))
ct <- cutree(h, h=deltaFreq)
cp <- as.matrix(cophenetic(h))
Reduce("rbind",lapply(unique(ct), function(i) {
n_ssms <- sum(clusters$n_ssms[ct==i])
w <- max(cp[ct==i,ct==i])
data.frame(new.cluster=i, n_ssms=n_ssms, proportion=sum(clusters$proportion[ct==i]*clusters$n_ssms[ct==i])/n_ssms, width=w)
}
))
}
#' Compute possible Mutation copy number states
#' @param bb Copy number with consensus annotation meta data
#' @param clusters
#' @param purity
#' @param gender
#' @param isWgd
#' @return list of length nrow(bb), can be added to mcols(bb)
#'
#' @author mg14
#' @export
defineMcnStates <- function(bb, clusters, purity, gender='female', isWgd= FALSE){
P <- vector(mode='list', length(bb))
uniqueBB <- unique(bb)
overlaps <- findOverlaps(uniqueBB, bb)
majorCN <- split(bb$major_cn[subjectHits(overlaps)], queryHits(overlaps))
m <- bb$minor_cn #hack: minor_cn > 0 in male samples - Battenberg bug?
if(gender=='male')
m[as.character(seqnames(bb)) %in% c('X','Y')] <- 0
minorCN <- split(m[subjectHits(overlaps)], queryHits(overlaps))
h <- selectHits(overlaps, "first")
H <- selectHits(overlaps, "last")
cnNormal <- 2 - (gender=='male' & seqnames(uniqueBB)=="X" | seqnames(uniqueBB)=="Y")
# Fix cluster and purity discrepancies
clusters$proportion[which.max(clusters$proportion)] <- purity
cloneFreq <- split(bb$clonal_frequency[subjectHits(overlaps)], queryHits(overlaps))
cnStates <- matrix(0, nrow=10000, ncol=6)
colnames(cnStates) <- c("state","m","f","n.m.s","pi.m.s","s")
power.c <- rep(0, nrow(clusters))
deltaFreq <- 0.05 # merge clusters withing deltaFreq
for( i in seq_along(uniqueBB)){
if(!i %in% names(majorCN)) next
majcni <- majorCN[[as.character(i)]]
mincni <- minorCN[[as.character(i)]]
cfi <- cloneFreq[[as.character(i)]]
effCnTumor <- sum((majcni + mincni)*cfi)
effCnNormal <- as.numeric(cnNormal[i]) * (1-purity)
if(any(is.na(majcni))) next
mixFlag <- FALSE
subclonalGainFlag <- FALSE
clonalFlag <- TRUE
majdelta <- 0
mindelta <- 0
majanc <- majder <- majcni
minanc <- minder <- mincni
if(length(cfi)>1){ # multiple (subclonal) CN states, if so add clonal option (ie. mixture of both states), subclonal states only change by 1..delta(CN)
d <- colSums(abs(rbind(majcni, mincni) - c(1,1) * (1+ isWgd)))
derived <- d == max(d) ## derived state further from diploid/tetraploid
if(all(derived)) next
majanc <- majcni[!derived]
minanc <- mincni[!derived]
majder <- majcni[derived]
minder <- mincni[derived]
majdelta <- majcni[derived] - majcni[!derived]
mindelta <- mincni[derived] - mincni[!derived]
majcni <- c(min(majcni), # clonal, sub on allele that doesn't change
(majcni[!derived] + cfi[derived]/purity*majdelta), # clonal, sub on allele that does change
(majcni[derived] >0) + (majdelta > 0)) # subclonal, subs after or before CNA, m=1,delta
mincni <- c(min(mincni), (mincni[!derived] + cfi[derived]/purity*mindelta), (mincni[derived] >0) + (mindelta > 0))
majdelta <- c(0, cfi[derived]/purity*majdelta, majdelta)
mindelta <- c(0, cfi[derived]/purity*mindelta, mindelta)
cfi <- c(purity, purity, cfi[derived])
mixFlag <- c(FALSE, TRUE, FALSE)
clonalFlag <- c(TRUE,TRUE,FALSE)
subclonalGainFlag <- c(FALSE, FALSE, TRUE)
}
a <- sapply(clusters$proportion, function(p) all(abs(p-cfi) > deltaFreq)) # subclone(s) not coinciding with CN change
if(any(a)){ # assume subclones have derived from most abundant CN state
majcni <- c(majcni, rep(majcni[which.max(cfi)]>0, sum(a))+0)
mincni <- c(mincni, rep(mincni[which.max(cfi)]>0, sum(a))+0)
cfi <- c(cfi, clusters$proportion[a])
mixFlag <- c(mixFlag, rep(FALSE, sum(a)))
clonalFlag <- c(clonalFlag, rep(FALSE, sum(a)))
subclonalGainFlag <- c(subclonalGainFlag, rep(FALSE, sum(a)))
majdelta <- c(majdelta, rep(0,sum(a)))
mindelta <- c(mindelta, rep(0,sum(a)))
}
totcni <- majcni+mincni
if(all(totcni==0)) next
k <- 0
for( j in seq_along(majcni)){
if(majcni[j]==0 & mincni[j]==0) {
f <- m <- 0 # allele frequency
pi.m.s <- n.m.s <- 1
l <- 1
}else{
l <- 1:max(majcni[j], mincni[j]) # mincni>majcni can occur if minor allele changes subclonally
m <- l
n.m.s <- rep(1, length(l)) #multiplicity of cn states
if(!mixFlag[j]){ # single subclone, or no subclonal cn
f <- m * cfi[j] / (effCnTumor + effCnNormal)
if(mincni[j] > 0)
n.m.s[1:min(majcni[j], mincni[j])] <- 2
pi.m.s <- n.m.s/sum(n.m.s)
}else{ # coexisting cn subclones, use mixture
d <- if(majdelta[j] != 0) majdelta[j] else mindelta[j]
M <- max(mincni[j]*(mindelta[j]!=0), majcni[j]*(majdelta[j]!=0))
m <- (d > 0):M + (M-floor(M))
l <- seq_along(m)
f <- m *cfi[j] / (effCnTumor + effCnNormal) # Variant on major allele
n.m.s <- rep(1, length(l))
pi.m.s <- n.m.s/sum(n.m.s)
}
}
cnStates[k + l,"state"]=j
cnStates[k + l,"m"]=m
cnStates[k + l,"f"]=f
cnStates[k + l,"pi.m.s"]=pi.m.s
cnStates[k + l,"n.m.s"]=n.m.s
k <- k + length(l)
}
whichStates <- (1:k)[cnStates[1:k,"f"]>0]
# State probabilities - based on cell fractions
cfi.s <- unique(cfi)
pi.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, clusters$n_ssms[which.min(abs(clusters$proportion - p))], 1))
pi.s <- pi.s/sum(pi.s)
c.to.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, which.min(abs(clusters$proportion - p)), NA)) # map to cluster
s.to.c <- sapply(clusters$proportion, function(c) which.min(abs(cfi.s - c)))
cnStates[1:k,"s"] = as.numeric(factor(cfi, levels=cfi.s))[cnStates[1:k,"state"]]
timing_param <- cbind(cnStates[whichStates,,drop=FALSE], cfi=cfi[cnStates[whichStates,"state"]], pi.s=pi.s[cnStates[whichStates,"s"]], P.m.sX=NA,P.m.sX.lo=NA, P.m.sX.up=NA, T.m.sX=NA, T.m.sX.lo=NA, T.m.sX.up=NA, power.s=NA, power.m.s = NA,
majCN=majcni[cnStates[whichStates,"state"]], minCN=mincni[cnStates[whichStates,"state"]],
majDelta = majdelta[cnStates[whichStates,"state"]], minDelta = mindelta[cnStates[whichStates,"state"]],
clonalFlag=clonalFlag[cnStates[whichStates,"state"]], subclonalGainFlag=subclonalGainFlag[cnStates[whichStates,"state"]], mixFlag=mixFlag[cnStates[whichStates,"state"]], majCNanc=majanc, minCNanc=minanc, majCNder=majder, minCNder=minder)
P[[h[i]]] <- timing_param
if(H[i] != h[i]) P[[H[[i]]]] <- P[[h[i]]]
}
return(P)
}
#' Compute Mutation copy numbers
#' @param vcf A vcf object of ssnms. See VariantAnnotation::readVcf()
#' @param bb The copy number as a GRanges() object, meta data in consensus format. See loadBB()
#' @param clusters A data.frame with the cluster proportion and n_ssms
#' @param purity The purity of the samples
#' @param gender 'male' or 'female'
#' @param isWgd TRUE/FALSE
#' @param xmin min read support. Needed for power calculations
#' @param rho Dispersion parameter
#' @return A list with elements (D: Annotated Data.Frame, can be added to vcf object; P: Timing parameters to be added to CN Ranges; power.c power of each cluster).
#'
#' @author mg14
#' @export
computeMutCn <- function(vcf, bb, clusters, purity, gender='female', isWgd= FALSE, xmin=3, rho=0, n.boot=200){
n <- nrow(vcf)
D <- DataFrame(MutCN=rep(NA,n), MutDeltaCN=rep(NA,n), MajCN=rep(NA,n), MinCN=rep(NA,n), MajDerCN=rep(NA,n), MinDerCN=rep(NA,n), CNF=rep(NA,n), CNID =as(vector("list", n),"List"), pMutCN=rep(NA,n), pGain=rep(NA,n),pSingle=rep(NA,n),pSub=rep(NA,n), pMutCNTail=rep(NA,n))
P <- defineMcnStates(bb,clusters, purity, gender, isWgd)
if(n==0)
return(list(D=D, P=P))
altCount <- getAltCount(vcf)
tumDepth <- getTumorDepth(vcf)
names(altCount) <- names(tumDepth) <- NULL
# Match VCF and CN
overlaps <- findOverlaps(vcf, bb)
D[["CNID"]] <- as(overlaps, "List")
majorCN <- split(bb$major_cn[subjectHits(overlaps)], queryHits(overlaps))
m <- bb$minor_cn #hack: minor_cn > 0 in male samples - Battenberg bug?
if(gender=='male')
m[as.character(seqnames(bb)) %in% c('X','Y')] <- 0
minorCN <- split(m[subjectHits(overlaps)], queryHits(overlaps))
h <- selectHits(overlaps, "first")
H <- selectHits(overlaps, "last")
cnNormal <- 2 - (gender=='male' & seqnames(vcf)=="X" | seqnames(vcf)=="Y")
# Fix cluster and purity discrepancies
clusters$proportion[which.max(clusters$proportion)] <- purity
cloneFreq <- split(bb$clonal_frequency[subjectHits(overlaps)], queryHits(overlaps))
power.c <- rep(0, nrow(clusters))
deltaFreq <- 0.05 # merge clusters withing deltaFreq
boundaryHits <- countOverlaps(vcf, unique(bb)) > 1 # indels overlapping with CN boundaries
for(globalIt in 1:2){ # 2 iterations, fist local (ie per segment) fit, then global
for( i in which( (diff(c(-1, h)) !=0 | is.na(diff(c(-1, h)) !=0) ) & ! boundaryHits )){
if(!i %in% names(majorCN)) next
if(is.na(h[i])) next
if(if(i>1) h[i] != h[i-1] | is.na(h[i-1]) else TRUE){ #ie. new segment
hh <- which(h==h[i] & !is.na(altCount) &! is.na(tumDepth))
if(length(hh)==0) next
if(is.null(bb$timing_param[[h[i]]])){
cnStates <- P[[h[i]]]
if(is.null(cnStates)) next
whichStates <- 1:nrow(cnStates)
# State probabilities - based on cell fractions
cfi.s <- unique(cnStates[,"cfi"])
pi.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, clusters$n_ssms[which.min(abs(clusters$proportion - p))], 1))
pi.s <- pi.s/sum(pi.s)
c.to.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, which.min(abs(clusters$proportion - p)), NA)) # map to cluster
s.to.c <- sapply(clusters$proportion, function(c) which.min(abs(cfi.s - c)))
# Likelihood
L <- matrix(sapply(pmin(cnStates[whichStates,"f"],1), function(pp) dtrbetabinom(altCount[hh],tumDepth[hh],pp, rho=rho, xmin=pmin(altCount[hh],0)) + .Machine$double.eps), ncol=length(whichStates))
# Power
power.sm <- colMeans(matrix(sapply(pmin(cnStates[whichStates,"f"],1), function(pp) 1-ptrbetabinom(pmin(altCount[hh],xmin-1),tumDepth[hh],pp, rho=rho, xmin=0)), ncol=length(whichStates)), na.rm=TRUE)
if(globalIt==2){
P.m.sX <- P[[h[i]]][,"P.m.sX"]
power.s <- sapply(split(power.sm * P.m.sX, cnStates[whichStates,"s"]), sum) # Power for state
power.s[!is.na(c.to.s)] <- power.c[na.omit(c.to.s)]
power.m.s <- power.sm # Relative power of m states (local) conditioned on s (global).
for(s in unique(cnStates[whichStates,"s"])) power.m.s[cnStates[whichStates,"s"]==s] <- power.m.s[cnStates[whichStates,"s"]==s] / max(power.m.s[cnStates[whichStates,"s"]==s]) #power.s[s]
}else{ # First iteration
power.m.s <- power.sm
power.s <- rep(1,length(whichStates))
}
mm <- function(x) {
x <- factor(x)
if(nlevels(x) > 1) t(model.matrix( ~ x + 0)) else matrix(1, ncol=length(x))
}
# EM algorithm (mixture fitting) for pi
P.m.sX <- cnStates[whichStates,"pi.m.s"]
s.from.m <- mm(cnStates[whichStates,"s"]) # indicator matrix to map
for(em.it in 1:100){
P.xsm <- L * rep(pi.s[cnStates[whichStates,"s"]] * P.m.sX / power.m.s / power.s[cnStates[whichStates,"s"]], each=nrow(L)) # P(X,s,m)
P.sm.x <- P.xsm/rowSums(P.xsm) # P(s,m|Xi)
P.sm.X <- pmax(.Machine$double.xmin,colMeans(P.sm.x)) # P(s,m|X) / piState[cnStates[1:k,"state"]] / cnStates[1:k,"pi.m.s"]
if(em.it==100) break
P.s.X <- s.from.m %*% P.sm.X
P.m.sX <- P.sm.X / P.s.X[cnStates[whichStates,"s"]]
}
toTime <- function(cnStates, P.m.sX, s.m) {
mAnc <- cnStates[,"m"] - cnStates[,"minDelta"] - cnStates[,"majDelta"]
mAnc.s <- factor(paste(mAnc, cnStates[,"s"], sep="."))
n <- (mAnc <= cnStates[,"majCNanc"]) + (mAnc <= cnStates[,"minCNanc"] )
mAnc.s.from.m <- mm(x = mAnc.s)# indicator matrix to map
return((mAnc.s.from.m[mAnc.s,] %*% P.m.sX) / (s.m[cnStates[,"s"],] %*% (P.m.sX * mAnc)) * (cnStates[,"majCNanc"] + cnStates[,"minCNanc"]) / n)
}
T.m.sX <- toTime(cnStates, P.m.sX, s.from.m)
if(globalIt==1){
p <- (sapply(split(power.sm * P.m.sX, cnStates[whichStates,"s"]), sum) * nrow(L))[s.to.c]
if(!any(is.na(p) | is.nan(p)))
power.c <- power.c + p
}
# Bootstrapping for CIs
if(globalIt==2){
b.m.sX <- if(n.boot>0) sapply(1:n.boot, function(foo){
L <- rbind(L, rep(1e-3, each=ncol(L))) #add an uniformative row
L <- L[sample(1:nrow(L), replace=TRUE),,drop=FALSE]
P.m.sX <- cnStates[whichStates,"pi.m.s"]
for(em.it in 1:50){
P.xsm <- L * rep(pi.s[cnStates[whichStates,"s"]] * P.m.sX / power.m.s / power.s[cnStates[whichStates,"s"]], each=nrow(L)) # P(X,s,m)
P.sm.x <- P.xsm/rowSums(P.xsm) # P(s,m|Xi)
P.sm.X <- colMeans(P.sm.x) # P(s,m|X) / piState[cnStates[1:k,"state"]] / cnStates[1:k,"pi.m.s"]
P.s.X <- s.from.m %*% P.sm.X
P.m.sX <- P.sm.X / P.s.X[cnStates[whichStates,"s"]]
}
return(P.m.sX)
}) else NA
if(n.boot>0) try({
CI.m.sX <- apply(b.m.sX, 1, quantile, c(0.025, 0.975))
cnStates[,"P.m.sX.lo"] <- CI.m.sX[1,]
cnStates[,"P.m.sX.up"] <- CI.m.sX[2,]
B.m.sX <- toTime(cnStates = cnStates, P.m.sX = b.m.sX, s.m = s.from.m)
C.m.sX <- apply(B.m.sX, 1, quantile, c(0.025, 0.975))
cnStates[,"T.m.sX.lo"] <- C.m.sX[1,]
cnStates[,"T.m.sX.up"] <- C.m.sX[2,]
}, silent=TRUE)
}
P.sm.x[apply(is.na(P.sm.x)|is.nan(P.sm.x),1,any),] <- NA
cnStates[,"P.m.sX"] <- P.m.sX
cnStates[,"T.m.sX"] <- T.m.sX
cnStates[,"power.s"] <- power.s[cnStates[whichStates,"s"]]
cnStates[,"power.m.s"] <- power.m.s
}else{
cnStates <- bb$timing_param[[h[i]]]
if(is.null(cnStates)) next
P.sm.x <- posteriorMutCN(x=altCount[hh],n=tumDepth[hh], cnStates=cnStates, xmin=0, rho=rho)
}
# Tail probs
pMutCNTail <- matrix(sapply(pmin(cnStates[,"f"],1), function(pp) ptrbetabinom(altCount[hh],tumDepth[hh],pp, rho=rho, xmin=pmin(altCount[hh],xmin))), ncol=nrow(cnStates)) #%*% c(pi.s[cnStates[whichStates,"state"]] * P.m.sX)
P[[h[i]]] <- cnStates
if(H[i] != h[i]) P[[H[[i]]]] <- P[[h[i]]]
w <- apply(P.sm.x, 1, function(x) if(any(is.na(x))) NA else which.max(x) )
if(all(is.na(w))) next
D[hh, "pSub"] <- rowSums(P.sm.x[, !cnStates[,"clonalFlag"], drop=FALSE])
D[hh, "pGain"] <- rowSums(P.sm.x[, cnStates[,"clonalFlag"] & cnStates[,"m"] > 1.00001 + cnStates[,"majDelta"] + cnStates[,"minDelta"], drop=FALSE])
#D[hh, "pSingle"] <- rowSums(P.sm.x[, cnStates[1:k,"state"] %in% which(clonalFlag) & cnStates[1:k,"m"]<=1, drop=FALSE])
D[hh, "pSingle"] <- 1 - D[hh, "pSub"] - D[hh, "pGain"]
D[hh,"MutCN"] <- cnStates[w,"m"]
D[hh,"MutDeltaCN"] <- cnStates[w,"majDelta"] + cnStates[w,"minDelta"]
D[hh,"MinCN"] <- cnStates[w,"minCNanc"]
D[hh,"MajCN"] <- cnStates[w,"majCNanc"]
D[hh,"MinDerCN"] <- cnStates[w,"minCNder"]
D[hh,"MajDerCN"] <- cnStates[w,"majCNder"]
D[hh,"CNF"] <- cnStates[w,"cfi"]
D[hh,"pMutCN"] <- sapply(seq_along(w), function(i) P.sm.x[i,w[i]])
D[hh,"pMutCNTail"] <- sapply(seq_along(w), function(i) pMutCNTail[i,w[i]])
}
}
if(globalIt==1){
power.c <- power.c / sum(!is.na(D[,"pSub"]))
if(any(is.na(power.c) | power.c==0)) {
break # Cancel 2nd iteration
warning("Power calculation yielded NA, aborting.")
}
if(any(power.c > 1)) {
warning("Calculated power > 1, rounding down.")
power.c <- pmin(1, power.c)
}
}
}
return(list(D=D,P=P, power.c=power.c))
}
mcnHeader <- function() {
DataFrame(Number=c(1,1,1,1,1,1,1,1,".",1,1,1,1),Type=c("Float","Float","Integer","Integer","Integer","Integer","Float","Float","Integer","Float","Float","Float","Float"),
Description=c("Mutation copy number","Change in MutCN between ancestral and derived state","Major copy number (ancestral)","Minor copy number (ancestral)","Major copy number (derived)","Minor copy number (derived)","Copy number frequency (relative to all cancer cells)", "MutCN probability","BB segment ids","Posterior prob: Early clonal","Posterior prob: Late clonal","Posterior prob: Subclonal", "Tail prob of mixture model"),
row.names=c("MutCN","MutDeltaCN","MajCN","MinCN","MajDerCN","MinDerCN","CNF","pMutCN","CNID","pGain","pSingle","pSub", "pMutCNTail"))
}
addMutCn <- function(vcf, bb=allBB[[meta(header(vcf))["ID",]]], clusters=allClusters[[meta(header(vcf))["ID",]]]){
i = info(header(vcf))
info(header(vcf)) <- rbind(i, mcnHeader())
info(vcf) <- cbind(info(vcf), computeMutCn(vcf, bb, clusters)$D)
return(vcf)
}
classifyMutations <- function(x, reclassify=c("missing","all","none")) {
reclassify <- match.arg(reclassify)
if(nrow(x) ==0 )
return(factor(NULL, levels=c("clonal [early]", "clonal [late]", "clonal [NA]", "subclonal")))
if(class(x)=="CollapsedVCF")
x <- info(x)
.clsfy <- function(x) {
cls <- x$CLS
if(reclassify %in% c("missing", "none") &! is.null(cls)){
if(all(unique(cls) %in% c("early", "late", "clonal", "subclonal")))
cls <- factor(cls, levels=c("early", "late", "clonal", "subclonal"), labels=c("clonal [early]", "clonal [late]", "clonal [NA]", "subclonal"))
cls <- as.character(cls)
cls[cls=="NA"] <- NA
if(reclassify=="missing" & any(is.na(cls)))
cls[is.na(cls)] <- paste(factor(apply(as.matrix(x[is.na(cls), c("pGain","pSingle","pSub")]), 1, function(x) if(all(is.na(x))) NA else which.max(x)), levels=1:3, labels=c("clonal [early]", "clonal [late]","subclonal"))) ## reclassify missing
}else{
cls <- paste(factor(apply(as.matrix(x[, c("pGain","pSingle","pSub")]), 1, function(x) if(all(is.na(x))) NA else which.max(x)), levels=1:3, labels=c("clonal [early]", "clonal [late]","subclonal"))) ## reclassify missing
}
cls[x$pGain==0 & cls!="subclonal"] <- "clonal [NA]"
if(!is.null(x$MajCN))
cls[cls!="subclonal" & (x$MajCN == 1 | x$MinCN == 1) & abs(x$MutCN - x$MutDeltaCN -1) <= 0.0001] <- "clonal [NA]"
cls <- factor(cls, levels=c("clonal [early]", "clonal [late]", "clonal [NA]", "subclonal"))
}
cls <- .clsfy(x)
return(cls)
}
posteriorMutCN <- function(x,n, cnStates, xmin=3, rho=0.01){
whichStates <- 1:nrow(cnStates)
L <- matrix(sapply(pmin(cnStates[whichStates,"f"],1), function(pp) dtrbetabinom(x,n,pp, rho=rho, xmin=pmin(x,xmin)) + .Machine$double.eps), ncol=length(whichStates))
P.xsm <- L * rep(cnStates[whichStates,"pi.s"] * cnStates[whichStates,"P.m.sX"] / cnStates[whichStates,"power.s"] / cnStates[whichStates,"power.m.s"], each=nrow(L)) # P(X,s,m)
P.sm.x <- P.xsm/rowSums(P.xsm) # P(s,m|Xi)
return(P.sm.x)
}
loadBB <- function(file){
tab <- read.table(file, header=TRUE, sep='\t')
GRanges(tab$chromosome, IRanges(tab$start, tab$end), strand="*", tab[-3:-1])
}
pGainToTime <- function(vcf){
P <- matrix(NA, nrow=nrow(vcf), ncol=4, dimnames=list(NULL, c("pEarly","pLate","pClonal[NA]","pSub")))
P[,c("pEarly","pClonal[NA]","pSub")] <- as.matrix(info(vcf)[,c("pGain","pSingle","pSub")])
biAllelicGain <- (info(vcf)$MajCN > 1 & (info(vcf)$MinCN > 1 | info(vcf)$MinCN == 0) & ! ((info(vcf)$MajCN == 1 | info(vcf)$MinCN == 1) & abs(info(vcf)$MutCN - info(vcf)$MutDeltaCN -1) <= 0.0001))
w <- which(biAllelicGain)
P[w, "pLate"] <- P[w, "pClonal[NA]"]
P[w, "pClonal[NA]"] <- 0
P[which(!biAllelicGain),"pLate"] <- 0
return(P)
}
piToTime <- function(timing_param, type=c("Mono-allelic Gain","CN-LOH", "Bi-allelic Gain (WGD)")){
type <- match.arg(type)
evo <- NA
w <- timing_param[,"s"]==1 &! timing_param[,"mixFlag"] ## Clonal states, not mixture (CN subclonal)
M <- sum(w) ## Max multiplicity = Major copy number
m <- sum(w & timing_param[,"n.m.s"]==2) ## Minor CN
#evo <- paste0("1:1", paste0(2,":",m), if(M>2) paste0(3,":",) else "", collapse="->")
t <- timing_param[(M:(M-1)),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE] ## Timing M and M-1
if(nrow(t)==1) t <- rbind(t, NA)
if(!any(is.na(t))){
if(M==4) {
if(timing_param[M-1,"P.m.sX"] < 0.02){ ## No MCN 3
if(type=="CN-LOH"){ ## Hotfix for 4+0, treated at 1:1 -> 2:0 -> 4:0
t <- timing_param[c(M,M-2),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE]*c(1,0.5) ## Timing M and M-2
evo <- "1:1->2:0->4:0"
}
else if(type=="Bi-allelic Gain (WGD)"){
if(m==2) {## Hotfix for 4+2 regions, treated at 1:1 -> 2:1 -> 4:2
t <- timing_param[c(M,M-2),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE] ## Timing M and M-2
t[2,] <- pmax(0,2*t[2,] - t[1,])/3
evo <- "1:1->2:1->4:2"
} else if(m==4) {## Hotfix for 4+4 regions, treated at 1:1 -> 2:2 -> 4:4
t <- timing_param[c(M,M-2),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE]*c(1,0.5) ## Timing M and M-2
evo <- "1:1->2:2->4:4"
}
}
} else if(type=="Bi-allelic Gain (WGD)"){ ## Can't uniquely time second event
t[2,] <- NA
}
if(m==3) {
t[2,] <- NA ## Don'time secondary 4+3 for now, needs more work
}
} else {
if(M==3 & type=="Bi-allelic Gain (WGD)") {## Hotfix for 3+2 regions, treated at 1:1 -> 2:2 -> 3:2
t[2,] <- pmax(0,2*t[2,] - t[1,])
evo <- "1:1->2:2->3:2"
}
}
}
colnames(t) <- c("","lo","up")
t[,2] <- pmin(t[,1],t[,2])
t[,3] <- pmax(t[,1],t[,3])
t <- pmin(apply(t,2,cumsum),1) ## Times are actually deltas
if(M < 3) t[2,] <- NA
t[is.infinite(t)] <- NA
rownames(t) <- c("",".2nd")
return(c(t[1,],`.2nd`=t[2,])) ## Linearise
}
bbToTime <- function(bb, timing_param = bb$timing_param, pseudo.count=5){
sub <- duplicated(bb)
covrg <- countQueryHits(findOverlaps(bb, bb))
maj <- sapply(timing_param, function(x) if(length(x) > 0) x[1, "majCNanc"] else NA) #bb$major_cn
min <- sapply(timing_param, function(x) if(length(x) > 0) x[1, "minCNanc"] else NA) #bb$minor_cn
type <- sapply(seq_along(bb), function(i){
if(maj[i] < 2 | is.na(maj[i]) | sub[i] | (maj[i] > 4 & min[i] >= 2)) return(NA)
type <- if(min[i]==1){ "Mono-allelic Gain"
}else if(min[i]==0){"CN-LOH"}
else "Bi-allelic Gain (WGD)"
return(type)
})
time <- t(sapply(seq_along(bb), function(i){
if(sub[i] | is.na(type[i])) return(rep(NA,6))
else piToTime(timing_param[[i]],type[i])
}))
res <- data.frame(type=factor(type, levels=c("Mono-allelic Gain","CN-LOH","Bi-allelic Gain (WGD)")), time=time)
colnames(res) <- c("type","time","time.lo","time.up","time.2nd","time.2nd.lo","time.2nd.up")
# posthoc adjustment of CI's
res$time.up <- (pseudo.count + bb$n.snv_mnv * res$time.up)/(pseudo.count + bb$n.snv_mnv)
res$time.lo <- (0 + bb$n.snv_mnv * res$time.lo)/(pseudo.count + bb$n.snv_mnv)
res$time.2nd.up <- (pseudo.count + bb$n.snv_mnv * res$time.2nd.up)/(pseudo.count + bb$n.snv_mnv)
res$time.2nd.lo <- (0 + bb$n.snv_mnv * res$time.2nd.lo)/(pseudo.count + bb$n.snv_mnv)
res$time.star <- factor((covrg == 1) + (min < 2 & maj <= 2 | min==2 & maj==2) * (covrg == 1), levels=0:2, labels = c("*","**","***")) ## ***: 2+0, 2+1, 2+2; **: 2<n+1, {3,4}+2; *: subclonal gains
res$time.star[is.na(res$time)] <- NA
return(res)
}
simulateMutations <- function(bb, purity=max(bb$clonal_frequency, na.rm=TRUE), n=40, rho=0.01, xmin=3){
g <- (averagePloidy(bb)*purity + 2*(1-purity))
V <- list(VRanges())#VRanges()
for(i in which(!duplicated(bb)))
if(bb$n.snv_mnv[i]>1 & !is.null( bb$timing_param[[i]]))try({
cnStates <- bb$timing_param[[i]]
p <- cnStates[,"pi.s"]* if(!any(is.na(cnStates[,"P.m.sX"]))) cnStates[,"P.m.sX"] else cnStates[,"pi.m.s"]
pwr <- cnStates[,"power.m.s"]#(cnStates[,"power.s"] * cnStates[,"power.m.s"])
s <- sample(1:nrow(cnStates), size=pmax(1,ceiling(bb$n.snv_mnv[i] * (p %*% (1/pwr)))), prob=p, replace=TRUE)
f <- cnStates[s,"f"]
mu.c <- (bb$total_cn[i]*purity + 2*(1-purity))/g * n
c <- rnbinom(length(f), size=1/rho, mu=mu.c)
x <- rbetabinom(n=length(f), size=c, prob=f, rho=rho)
pos <- round(runif(length(f), min=start(bb)[i], max=end(bb)[i]))
w <- which(x>=xmin)
V[[i]] <- VRanges(seqnames=seqnames(bb)[i], IRanges(pos, width=1), ref="N", alt="A", totalDepth=c, altDepth=x)[w]
})
V <- do.call("c", V[!sapply(V, is.null)])
sampleNames(V) <- "SAMPLE"
v <- as(V, "VCF")
info(header(v)) <- rbind(info(header(v)),info(header(finalSnv[[1]]))[c("t_ref_count","t_alt_count"),])
info(v)$t_alt_count <- altDepth(V)
info(v)$t_ref_count <- totalDepth(V) - altDepth(V)
return(v)
}
All basic functions
# This file contains a range of convenience functions for running a
# range of timing analyses of the PCAWG-11 Evolution and Heterogeneity
# working group, detailed in PCAWG-final.R
#
# Author: Moritz Gerstung <moritz.gerstung@ebi.ac.uk>s
###############################################################################
library(Rsamtools)
library(VariantAnnotation)
vcfPath <- '../final/final_consensus_12oct_passonly/snv_mnv'
dpPath <- paste0('../final/consensus_subclonal_reconstruction_20170325')
purityPloidy <- read.table( '../final/consensus.20170218.purity.ploidy.txt', header=TRUE, row.names=1)
allGender <- read.table('../final/2016_12_09_inferred_sex_all_samples.txt', header=TRUE, sep='\t')
allGender <- allGender[!duplicated(allGender$tumourid) & allGender$tumourid != 'tumourid',]
#write.table(gender,'../gender/2016_12_09_inferred_sex_all_samples_CORRECTED_MG.txt', row.names=FALSE, col.names=TRUE, sep='\t', quote=FALSE)
rownames(allGender) <- allGender$tumourid
addTNC <- function(vcf){
r = "../ref/genome.fa.gz" #meta(header(v))["reference",]
if(!"TNC" %in% rownames(header(vcf)@header$INFO)){
tnc=scanFa(file=r, resize(granges(vcf), 3,fix="center"))
i = header(vcf)@header$INFO
info(header(vcf)) <- rbind(i, DataFrame(Number=1,Type="String",Description="Trinucleotide context", row.names="TNC"))
info(vcf)$TNC <- as.character(tnc)
}
return(vcf)
}
dpFiles <- dir(dpPath, pattern="_subclonal_structure.txt", recursive=TRUE)
bbFiles <- dir(bbPath, pattern="_segments.txt", recursive=TRUE)
wccTable <- read.table("../final/wcc_consensus_values_9_12.tsv", header=TRUE, sep='\t')
d <- data.frame(cluster=wccTable$sc_n, n_ssms=wccTable$consensus_mutation_number, proportion = round(wccTable$consensus_cluster_cp,3))
d[d$cluster==1,"proportion"] <- wccTable[d$cluster==1,"purity"]
wccClusters <- split(d, wccTable$sid)
wccPurity <- d[d$cluster==1,"proportion"]
#plot(wccPurity, purityPloidy[,"purity"])
loadClusters <- function(ID){
file <- paste0(dpPath,"/",grep(paste0(ID,"[[:punct:]]"), dpFiles, value=TRUE, perl=TRUE))
if(grepl(".gz", file))
file <- gzfile(file)
read.table(file, header=TRUE, sep="\t")
}
loadConsensusClusters <- function(ID){
file <- paste0(dpPath,"/",grep(paste0(ID,"[[:punct:]]"), dpFiles, value=TRUE, perl=TRUE))
if(grepl(".gz", file))
file <- gzfile(file)
read.table(file, header=TRUE, sep="\t")
}
consensusClustersToOld <- function(clusters){
data.frame(cluster=clusters$cluster, n_ssms=clusters$n_snvs, proportion=clusters$fraction_total_cells)
}
parseRegion <- function(regions){
chr <- regmatches(regions, regexpr("^.+(?=:)",regions,perl=TRUE))
start <- as.numeric(regmatches(regions, regexpr("(?<=:)[0-9]+(?=-)",regions,perl=TRUE)))
end <- as.numeric(regmatches(regions, regexpr("(?<=-)[0-9]+$",regions,perl=TRUE)))
data.frame(chr,start,end)
}
loadConsensusCNA <- function(ID, purity=1, path="../final/consensus.20170119.somatic.cna.annotated"){
file <- grep(paste0(ID,"[[:punct:]]"), dir(path, pattern="cna.annotated.txt", recursive=TRUE, full.names=TRUE), value=TRUE)
if(grepl(".gz", file))
file <- gzfile(file)
tab <- read.table(file, header=TRUE, sep='\t')
subclonalIndex <- !is.na(tab$total_cn) & !is.na(tab$battenberg_nMaj2_A) & !is.na(tab$battenberg_nMin2_A) & !is.na(tab$battenberg_frac2_A) & (tab$battenberg_nMaj1_A == tab$major_cn & tab$battenberg_nMin1_A == tab$minor_cn | tab$battenberg_nMaj2_A == tab$major_cn & tab$battenberg_nMin2_A == tab$minor_cn)
ix <- c(1:nrow(tab), which(subclonalIndex))
gr <- GRanges(tab$chromosome, IRanges(tab$start, tab$end), strand="*", clonal_frequency=purity, tab[-3:-1])[ix]
if(any(subclonalIndex)){
gr$clonal_frequency[which(subclonalIndex)] <- tab$battenberg_frac1_A[subclonalIndex] * purity
gr$major_cn[which(subclonalIndex)] <- tab$battenberg_nMaj1_A[subclonalIndex]
gr$minor_cn[which(subclonalIndex)] <- tab$battenberg_nMin1_A[subclonalIndex]
gr$total_cn[which(subclonalIndex)] <- tab$battenberg_nMaj1_A[subclonalIndex] + tab$battenberg_nMin1_A[subclonalIndex]
gr$clonal_frequency[-(1:nrow(tab))] <- tab$battenberg_frac2_A[subclonalIndex] * purity
gr$total_cn[-(1:nrow(tab))] <- tab$battenberg_nMaj2_A[subclonalIndex] + tab$battenberg_nMin2_A[subclonalIndex]
gr$major_cn[-(1:nrow(tab))] <- tab$battenberg_nMaj2_A[subclonalIndex]
gr$minor_cn[-(1:nrow(tab))] <- tab$battenberg_nMin2_A[subclonalIndex]
}
seqlevels(gr) <- c(1:22, "X","Y")
sort(gr)
}
refFile = "../ref/genome.fa.gz" #meta(header(v))["reference",]
refLengths <- scanFaIndex(file=refFile)
chrOffset <- cumsum(c(0,as.numeric(width(refLengths))))
names(chrOffset) <- c(seqlevels(refLengths), "NA")
averagePloidy <- function(bb) {
c <- if(!is.null(bb$copy_number)) bb$copy_number else bb$total_cn
sum(width(bb) * c * bb$clonal_frequency, na.rm=TRUE) / sum(width(bb) * bb$clonal_frequency, na.rm=TRUE)
}
averageEvenPloidy <- function(bb){
sum(width(bb) * (!bb$major_cn %% 2 & !bb$minor_cn %% 2) * bb$clonal_frequency, na.rm=TRUE) / sum(width(bb) * bb$clonal_frequency, na.rm=TRUE)
}
getTumorCounts <- function(vcf){
sapply(grep("(F|R).Z", names(geno(vcf)), value=TRUE), function(i) geno(vcf)[[i]][,"TUMOUR"])
}
getTumorDepth <- function(vcf){
if("t_alt_count" %in% colnames(info(vcf))){ ## consensus data, snv and indel
info(vcf)$t_alt_count + info(vcf)$t_ref_count
}else{ ## older data
if("FAZ" %in% rownames(geno(header(vcf)))){
rowSums(getTumorCounts(vcf))
}else{
geno(vcf)$DEP[,2]
}
}
}
getAltCount <- function(vcf){
if("t_alt_count" %in% colnames(info(vcf))){ ## consensus data, snv and indel
info(vcf)$t_alt_count
}else{ ## older formats
if("FAZ" %in% rownames(geno(header(vcf)))){ ## ie subs
c <- getTumorCounts(vcf)
t <- c[,1:4] + c[,5:8]
colnames(t) <- substring(colnames(t),2,2)
a <- as.character(unlist(alt(vcf)))
a[!a%in%c('A','T','C','G')] <- NA
sapply(seq_along(a), function(i) if(is.na(a[i])) NA else t[i, a[i]])
}
else{ ## ie indel
#(geno(vcf)$PP + geno(vcf)$NP + geno(vcf)$PB + geno(vcf)$NB)[,"TUMOUR"]
geno(vcf)$MTR[,2]
}
}
}
mergeClusters <- function(clusters, deltaFreq=0.05){
if(nrow(clusters) <= 1) return(clusters)
h <- hclust(dist(clusters$proportion), members=clusters$n_ssms)
ct <- cutree(h, h=deltaFreq)
cp <- as.matrix(cophenetic(h))
Reduce("rbind",lapply(unique(ct), function(i) {
n_ssms <- sum(clusters$n_ssms[ct==i])
w <- max(cp[ct==i,ct==i])
data.frame(new.cluster=i, n_ssms=n_ssms, proportion=sum(clusters$proportion[ct==i]*clusters$n_ssms[ct==i])/n_ssms, width=w)
}
))
}
removeSuperclones <- function(clusters, min.frac=0.1, delta.prop=0.1) {
m <- which(clusters$proportion == max(clusters$proportion[clusters$n_ssms >= 0.1 * sum(clusters$n_ssms)]))
w <- clusters$proportion >= clusters$proportion[m] - delta.prop
if(sum(w)>1){
cl <- as.data.frame(rbind(if(any(!w)) clusters[!w,,drop=FALSE], if(any(w)) colSums(clusters[w,,drop=FALSE]*(clusters[w,"n_ssms"]/sum(clusters[w,"n_ssms"])))))
cl[nrow(cl),"n_ssms"] <- sum(clusters[w,"n_ssms"])
clusters <- cl
}
return(clusters)
}
clustersFromBB <- function(bb){
w <- bb$clonal_frequency == max(bb$clonal_frequency, na.rm=TRUE) | bb$clonal_frequency < 0.5 * max(bb$clonal_frequency, na.rm=TRUE)
t <- table(bb$clonal_frequency[w])
cl <- data.frame(cluster=seq_along(t), n_ssms=as.numeric(t), proportion=as.numeric(names(t)))
mergeClusters(cl, deltaFreq=0.2)
}
mergeClustersByMutreadDiff = function(clusters, purity, ploidy, vcf_snv, min_read_diff) {
calc_exp_mutreads_ccf = function(ccf, purity, ploidy, mean_depth) {
return(ccf / (purity*ploidy + (1-purity)*2) * mean_depth)
}
clusters_new = clusters
exp_reads = sapply(clusters$ccf, calc_exp_mutreads_ccf, purity=purity, ploidy=ploidy, mean_depth=mean(getTumorDepth(vcf_snv), na.rm=T))
ccf_diff = exp_reads[1:(length(exp_reads)-1)] - exp_reads[2:length(exp_reads)]
if (any(ccf_diff < min_read_diff)) {
# Iteratively merge a pair of clusters untill no more pairs within distance can be found
merged = T
while(merged) {
merged = F
exp_reads = sapply(clusters_new$ccf, calc_exp_mutreads_ccf, purity=purity, ploidy=ploidy, mean_depth=mean(getTumorDepth(vcf_snv), na.rm=T))
ccf_diff = exp_reads[1:(length(exp_reads)-1)] - exp_reads[2:length(exp_reads)]
to_merge = which(ccf_diff < min_read_diff)
if (length(to_merge)==0) {
merged = F
break
} else {
i = to_merge[1]
clusters_new$ccf[i] = sum(clusters_new$ccf[c(i, i+1)]*clusters_new$n_ssms[c(i, i+1)]) / sum(clusters_new$n_ssms[c(i, i+1)])
clusters_new$n_ssms[i] = sum(clusters_new$n_ssms[c(i, i+1)])
clusters_new = clusters_new[-(i+1),]
merged = T
}
if (nrow(clusters_new)==1) {
merged = F
break
}
}
}
clusters_new$proportion = clusters_new$ccf * purity
# sorting and renumbering clusters
clusters_new = clusters_new[with(clusters_new, order(proportion, decreasing=T)),]
clusters_new$cluster = 1:nrow(clusters_new)
return(clusters_new)
}
probGenotype <- function(vcf){
dg <- factor(paste(unlist(info(vcf)$DG)), levels=c("NA",as.character(CANCERGENES)))
P <- pGainToTime(vcf)
G <- matrix(0, ncol=5, nrow=nlevels(dg), dimnames=list(levels(dg), c(colnames(P),"NA")))
t <- table(dg)
for(g in names(t[t>0]))
G[g,] <- c(colSums(P[dg==g,,drop=FALSE],na.rm=TRUE), "NA"=sum(is.na(P[dg==g,1])))
return(G)
}
probGenotypeTail <- function(vcf){
dg <- factor(paste(unlist(info(vcf)$DG)), levels=c("NA",as.character(CANCERGENES)))
P <- info(vcf)$pMutCNTail
G <- rep(NA, nlevels(dg))
names(G) <- levels(dg)
t <- table(dg)
for(g in names(t[t>0]))
G[g] <- mean(P[dg==g,drop=FALSE],na.rm=TRUE)
return(G)
}
getGenotype <- function(vcf, reclassify='missing', ...){
w <- c(TRUE,diff(start(vcf)) != 1)
cls <- classifyMutations(vcf, reclassify=reclassify)
t <- info(vcf)$TCN
if(is.null(t))
t <- info(vcf)$MinCN + info(vcf)$MajCN
hom <- factor(info(vcf)$MutCN==t, levels=c(TRUE,FALSE))
dg <- factor(unlist(info(vcf)$DG), levels=as.character(CANCERGENES))
table(gene=dg[w], class=cls[w], homozygous=hom[w], ...)
}
tryExceptNull <- function(x) if(class(x)=="try-error") GRanges() else x
loadVcf <- function(ID){
file <- dir(vcfPath, pattern=paste0(ID, ".+somatic.snv_mnv.TNC.vcf.bgz$"), full.names=TRUE)
pos <- loadPositions(ID)
vcf <- readVcf(file, genome="GRCh37") #, param=ScanVcfParam(which=pos))
f <- findOverlaps(pos, vcf, select="first")
vcf <- vcf[na.omit(f)]
vcf <- addDriver(vcf, CANCERGENES)
i = header(vcf)@header$INFO
info(header(vcf)) <- rbind(i, DataFrame(Number=1,Type="Numeric",Description="DP cluster", row.names="DPC"))
i = header(vcf)@header$INFO
info(header(vcf)) <- rbind(i, DataFrame(Number=1,Type="Numeric",Description="DP cluster probability", row.names="DPP"))
info(vcf)$DPC <- pos$cluster[!is.na(f)]
info(vcf)$DPP <- pos$likelihood[!is.na(f)]
vcf
}
isDeamination <- function(vcf) grepl("(A.CG)|(T..CG)", paste(as.character(unlist(alt(vcf))),vcf@info$TNC))
isDeaminationNoUV <- function(vcf) grepl("(A.CG[C,T])|(T.[A,G]CG)", paste(as.character(unlist(alt(vcf))),vcf@info$TNC))
testDriver <- function(vcf) sapply(info(vcf)$VC, function(x) if(length(x) ==0) FALSE else any(x %in% c('nonsense','missense','ess_splice','frameshift','inframe','cds_distrupted')))
addDriver <- function(vcf, mutsigDrivers){
i = header(vcf)@header$INFO
info(header(vcf)) <- rbind(i, DataFrame(Number=1,Type="String",Description="Driver gene", row.names="DG"))
if(nrow(vcf)==0){
info(vcf)$DG <- CharacterList()
return(vcf)
}
r <- paste(paste0("^",mutsigDrivers,"(?=\\|)"), collapse="|")
rowIdx <- grepl(r, info(vcf)$VD, perl=TRUE) & testDriver(vcf)
g <- sapply(info(vcf)$VD, function(x) sub("\\|.+","", x))
d <- ifelse(rowIdx,g,character(0))
#d[is.na(d)] <- character(0)
info(vcf)$DG <- as(d,"CharacterList")
vcf
}
loadAssignment <- function(ID){
file <- paste0(dpPath,"/",ID,"_DPoutput_1250iters_250burnin/",ID,"_DP_and_cluster_info.txt")
read.table(file, header=TRUE, sep="\t")
}
addAssignment <- function(vcf, ID){
a <- loadAssignment(ID)
i = header(vcf)@header$INFO
info(header(vcf)) <- rbind(i, DataFrame(Number=ncol(a),Type="Numeric",Description="DP probability", row.names="DPP"))
info(vcf)$DPP <- as.matrix(a)
vcf
}
loadPositions <- function(ID){
file <- gzfile(paste0(dpPath,"/",ID,"_mutation_assignments.txt.gz"))
tmp <- read.table(file, header=TRUE, sep="\t")
#GRanges(tmp$chr, IRanges(tmp$start+1, tmp$end), cluster=tmp$cluster, likelihood=tmp$likelihood)
GRanges(tmp$chr, IRanges(tmp$pos, width=1), cluster=tmp$cluster)
}
tncToPyrimidine <- function(vcf){
a <- unlist(alt(vcf))
r <- ref(vcf)
tnc <- DNAStringSet(info(vcf)$TNC)
rc <- grepl("A|G", r)
tnc[rc] <- reverseComplement(tnc[rc])
a[rc] <- reverseComplement(a[rc])
t <- paste0(substr(tnc,1,1), "[",substr(tnc,2,2), ">",a, "]", substr(tnc,3,3))
n <- c("A","C","G","T")
f <- paste0(rep(n, each=4), "[", rep(c("C","T"), each=96/2), ">", c(rep(c("A","G","T"), each=48/3), rep(c("A","C","G"), each=48/3)), "]", n)
factor(t, levels=f)
}
applyPigeonHole <- function(ID){
c <- loadClusters(ID)
p <- purityPloidy[ID,"purity"]
mcf <- c$proportion#*p
l <- sapply(1:length(mcf), function(i) mcf[i] > pmax(0,1-mcf))
w <- which(l & upper.tri(l), arr.ind=TRUE)
cbind(c$cluster[w[,1]], c$cluster[w[,2]])
}
reduceToCoverRelations <- function(rel){
if(nrow(rel) ==0) return(rel)
n <- max(rel)
P <- matrix(FALSE,n,n)
for(i in 1:nrow(rel))
P[rel[i,1],rel[i,2]] <- TRUE
for(i in 1:n){
q <- list()
visit <- logical(n)
for(j in 1:n)
if(P[i,j])
q <- c(q,j)
while(length(q)>0){
j <- q[[1]]
q[[1]] <- NULL
for(k in 1:n){
if(P[j,k] & !visit[k]){
visit[k] <- TRUE
q <- c(q,k)
if(P[i,k])
P[i,k] <- FALSE
if(P[k,i])
stop("Error.")
}
}
}
}
which(P, arr.ind=TRUE)
}
cnWeights <- function(vcf){
t <- if(is.null(info(vcf)$TCN)) (info(vcf)$MajCN + info(vcf)$MinCN) else info(vcf)$TCN
info(vcf)$MutCN / t
}
branchLengths <- function(vcf, type=c("all","deamination")){
type <- match.arg(type)
if(type=="deamination")
w <- isDeamination(vcf)
else
w <- rep(TRUE, nrow(vcf))
cnw <- cnWeights(vcf)
u <- allClusters[[meta(header(vcf))$META["ID",1]]]$cluster
b <- sapply(u, function(c) sum(2*cnw * (info(vcf)$DPC==c & w), na.rm=TRUE))
#if(length(b)==0) b <- rep(0, length(u))
names(b) <- u
return(b)
}
avgWeights <- function(vcf, type=c("all","deamination")){
type <- match.arg(type)
if(type=="deamination")
w <- isDeamination(vcf)
else
w <- rep(TRUE, nrow(vcf))
cnw <- cnWeights(vcf)
mean(2*cnw[w], na.rm=TRUE)
}
predictRealTime <- function(x, signatures=snmf$snmfFit$P[-1,]){
snmf$snmfFit$P[1,1] / snmf$weight * snmf$nmSolve(x, signatures, maxIter=100)[1,]
}
#' ### Compute graphs (posets)
toGraph <- function(edgelist, branch.length, edge.labels, node.labels=1:max(edgelist)){
g <- graph.edgelist(edgelist)
E(g)$weight <- branch.length
E(g)$name <- edge.labels
V(g)$name <- node.labels
return(g)
}
na.rm <- function(x) x[!is.na(x)]
plotPoset <- function(g){
c <- colorRampPalette(brewer.pal(9, "Spectral"))(10)
plot(g, layout=layout.reingold.tilford(g), edge.label=E(g)$name, vertex.size = 25*pmin(1,V(g)$size), edge.width=E(g)$weight/100)
}
#' Distance in poset
posetDist <- function(g) {
e <- get.edgelist(g)
w <- c(0,E(g)$weight)
names(w) <- c("Germline", e[,2])
d <- shortest.paths(g, mode='out')
d <- d - rep(w[colnames(d)], each=length(w))/2
diag(d) <- NA
d
}
getMutationCluster <- function(allMutations, vcf){
m <- match(allMutations, unlist(info(vcf)$DG))
info(vcf)$DPC[m]
}
distAsRange <- function(g){
e <- get.edgelist(g)
w <- c(0,E(g)$weight)
names(w) <- c("Germline", e[,2])
d <- shortest.paths(g, mode='out')
y <- shift(IRanges(-w[colnames(d)],0), d["Germline", ])
names(y) <- paste0(colnames(d), ", genes=",E(g)$name[match(colnames(d), e[,2])])
y
}
nmSolve <- function(D, P, maxIter = 500, tol=1e-3) {
n <- nrow(D)
m <- ncol(D)
s <- ncol(P)
tP <- t(P)
rP <- rep(colSums(P), m)
D <- as.matrix(D)
E1 <- E2 <- matrix(runif(s * m, 1e-3, 1), ncol = m)
err <- 2*tol
iter <- 1
while (iter < maxIter & err > tol) {
E1 <- E2
E2 <- E1 * (tP %*% (D/(P %*% (E1 + .Machine$double.eps))))/rP
iter <- iter + 1
err <- mean(abs(E2 - E1)/(E1+.Machine$double.eps), na.rm=TRUE)
if(iter %% 100 == 0) cat(round(-log10(err)))
}
cat("\n")
if(iter == maxIter) warning(paste("No convergence after",iter, "iterations."))
E2
}
wnmSolve <- function(D, P, weights = rep(0, ncol(P)), maxIter = 500, tol=1e-3) {
D <- as.matrix(D)
D <- rbind(D, matrix(weights, ncol=ncol(D), nrow=ncol(P)))
P <- rbind(P, diag(rep(1,ncol(P))))
n <- nrow(D)
m <- ncol(D)
s <- ncol(P)
tP <- t(P)
rP <- rep(colSums(P), m)
E1 <- E2 <- matrix(runif(s * m, 1e-3, 1), ncol = m)
err <- 2*tol
iter <- 1
while (iter < maxIter & err > tol) {
E1 <- E2
E2 <- E1 * (tP %*% (D/(P %*% (E1 + .Machine$double.eps))))/rP
iter <- iter + 1
err <- mean(abs(E2 - E1)/(E1+.Machine$double.eps), na.rm=TRUE)
if(iter %% 100 == 0) cat(round(-log10(err)))
}
cat("\n")
if(iter == maxIter) warning(paste("No convergence after",iter, "iterations."))
E2
}
wgdTest <- function(vcf){
id <- meta(header(vcf))$META["ID",1]
bb <- allBB[[id]]
ix <- which(bb$copy_number==4 & bb$minor_cn==2)
v <- vcf[vcf %over% bb[ix]]
#w <- sum(as.numeric(width(reduce(bb[ix]))))
t <- table(info(v)$MutCN, info(v)$TCN, as.character(seqnames(v)), info(v)$DPC)
}
#' Power
power <- function(f,n, theta=6.3, err=1e-4) if(any(is.na(c(f,n,theta,err)))) NA else sum((log10(dbinom(0:n, n, 0:n/n) / dbinom(0:n,n,err)) > theta) * dbinom(0:n,n,f))
testIndel <- function(vcf) sapply(info(vcf)$VC, function(x) if(length(x) ==0) FALSE else any(x %in% c('frameshift','inframe','ess_splice','SO:0001577:complex_change_in_transcript', 'SO:0001582:initiator_codon_change', 'splice_region')))
asum <- function(x, dim) apply(x, setdiff(seq_along(dim(x)), dim), sum)
#' official driver file
#d <- lapply(2:3, function(sheet) xlsx::read.xlsx2(file="../ref/TableS2_driver_point_mutations_annotation.xlsx", sheetIndex=sheet, colIndex=1:22, stringsAsFactors=FALSE, na.char="NaN"))
#colnames(d[[2]]) <- colnames(d[[1]])
#drivers <- do.call("rbind",d)
#drivers[drivers=="NaN" | drivers==""] <- NA
#drivers <- as.data.frame(sapply(drivers, function(x) if(all(!is.na(as.numeric(x[!is.na(x)])))) as.numeric(x) else x, simplify=FALSE))
finalData <- read.table("../final/ref/release_may2016.v1.4.tsv", header=TRUE, sep="\t")
#r <- gsub("-","",drivers$ref)
#i <- grepl("-",drivers$ref) | grepl("-",drivers$alt) #drivers$mut_type=="indel" # need to fix indels
#r[i] <- paste0("N",r[i])
#a <- gsub("-","",drivers$alt)
#a[i] <- paste0("N",a[i])
#p <- drivers$pos
#p[i & !grepl("-", drivers$ref)] <- p[i & !grepl("-", drivers$ref)]-1
#m <- sapply(levels(drivers$sample), function(x) grep(x, finalData$sanger_variant_calling_file_name_prefix))
#finalDrivers <- VRanges(seqnames = drivers$chr, ranges=IRanges(p, width = width(r)), ref=DNAStringSet(r), alt=DNAStringSet(a), sampleNames = finalData$icgc_donor_id[m[drivers$sample]])
#mcols(finalDrivers) <- cbind(sample=drivers$sample, samples=finalData$sanger_variant_calling_file_name_prefix[m[drivers$sample]], ID= drivers$gene_id, drivers[,8:22], mut_type=ifelse(i, "indel","snv"))
#save(finalDrivers, file="../ref/TableS2_driver_point_mutations_annotation.RData")
load(file="../ref/TableS2_driver_point_mutations_annotation.RData")
CANCERGENES <- levels(finalDrivers$ID)
matchDrivers <- function(vcf, finalDrivers) {
ID <- meta(header(vcf))$META["ID",1]
d <- finalDrivers[grep(ID, finalDrivers$samples)]
g <- factor(rep(NA,nrow(vcf)), levels = levels(d$ID))
if(length(d)==0)
return(g)
overlaps <- findOverlaps(vcf, d)
g[queryHits(overlaps)] <- d$ID[subjectHits(overlaps)]
return(g)
}
addFinalDriver <- function(vcf, finalDrivers){
i = header(vcf)@header$INFO
info(header(vcf)) <- rbind(i, DataFrame(Number=1,Type="String",Description="Driver mutation", row.names="DG"))
info(vcf)$DG <- factor(rep(NA,nrow(vcf)), levels = levels(finalDrivers$ID))
if(nrow(vcf)==0)
return(vcf)
g <- matchDrivers(vcf = vcf, finalDrivers = finalDrivers)
info(vcf)$DG <- g
return(vcf)
}
clinicalData <- read.table("../ref/pcawg_donor_clinical_August2016_v9.tsv", header=TRUE, sep="\t", comment="", quote="")
load("../ref/Sarcs_ages.RDa")
for(x in Sarcs_age)
clinicalData$donor_age_at_diagnosis[match(as.character(x$icgc_donor_id), as.character(clinicalData$icgc_donor_id))] <- as.numeric(x$Age)
rm(Sarcs_age)
specimenData <- read.table("../ref/pcawg_specimen_histology_August2016_v9.tsv", header=TRUE, sep="\t", comment="", quote="")
specimenData$histology_abbreviation <- sub("CA$","Ca",specimenData$histology_abbreviation)
s <- strsplit(as.character(finalData$sanger_variant_calling_file_name_prefix),",")
sample2donor <- as.character(finalData$icgc_donor_id[unlist(sapply(seq_along(s), function(i) rep(i, length(s[[i]]))))])
names(sample2donor) <- unlist(s)
s <- unlist(strsplit(as.character(finalData$sanger_variant_calling_file_name_prefix),","))
sample2icgc <- unlist(strsplit(as.character(finalData$tumor_wgs_icgc_sample_id),",")) #as.character(finalData$tumor_wgs_icgc_sample_id[unlist(sapply(seq_along(s), function(i) rep(i, length(s[[i]]))))])
names(sample2icgc) <- s#unlist(s)
donor2type <- factor(specimenData$histology_abbreviation, levels=c(sort(unique(specimenData$histology_abbreviation))[-1], ""))
donor2type <- as.character(donor2type)
donor2type[donor2type=="Kidney-RCC" & grepl("clear cell", specimenData$histology_tier4)] <- "Kidney-CCRCC"
donor2type[donor2type=="Kidney-RCC" & grepl("papillary",specimenData$histology_tier4)] <- "Kidney-PapRCC"
donor2type <- factor(donor2type)
names(donor2type) <- specimenData$icgc_donor_id
levels(donor2type)[levels(donor2type)==""] <- "Other/NA"
t <- read.table("../ref/tumour_subtype_consolidation_map.tsv - Unique List of Tumour Types_August.tsv", sep='\t', header=TRUE, comment="")
c <- as.character(t$`Color..RGB.code.`)
names(c) <- sub("CA$","Ca",t$`Abbreviation`)
c <- c[c != "" & !duplicated(names(c))]
tissueColors <- c(table(donor2type))*NA
tissueColors[names(c)] <- c
tissueColors["Lymph-CLL"] <- "#F4A35D"
tissueColors["Kidney-CCRCC"] <- tissueColors["Kidney-RCC"]
tissueColors["Kidney-PapRCC"] <- "#E53E00"
tissueColors <- tissueColors[levels(donor2type)]
tissueBorder <- c("white","black")[names(tissueColors) %in% c("Lung-SCC","Lung-AdenoCa")+1]
names(tissueBorder) <- names(tissueColors)
tissueLines <- tissueColors
tissueLines[names(tissueColors) %in% c("Lung-SCC","Lung-AdenoCa")] <- "black"
tissueLty <- c(1,1)[names(tissueColors) %in% c("Lung-SCC","Lung-AdenoCa")+1]
names(tissueLty) <- names(tissueColors)
tissueLty["Lung-SCC"] <- 5
tissueLty["Lung-AdenoCa"] <- 4
tissueCex <- tissueLty
tissueCex[grep("Lung", names(tissueCex))] <- 0.8
averageHom <- function(bb){
sum(width(bb) * (bb$minor_cn == 0) * bb$clonal_frequency, na.rm=TRUE) / sum(width(bb) * bb$clonal_frequency, na.rm=TRUE)
}
.classWgd <- function(ploidy, hom) 2.9 -2*hom <= ploidy
classWgd <- function(bb) .classWgd(averagePloidy(bb), averageHom(bb))
plotBB <- function(bb, ylim=c(0,max(max(bb$total_cn, na.rm=TRUE))), col=RColorBrewer::brewer.pal(4,"Set2"), type=c("lines","bars"), legend=TRUE, lty.grid=1, col.grid="grey", xaxt=TRUE, xlim=c(min(chrOffset[as.character(seqnames(bb))]+start(bb)),max(chrOffset[as.character(seqnames(bb))]+end(bb))), add=FALSE){
type <- match.arg(type)
s <- c(1:22, "X","Y")
l <- as.numeric(width(refLengths[seqnames(refLengths) %in% s]))
names(l) <- s
c <- cumsum(l)
if(add==FALSE){
plot(NA,NA, ylab="Copy number",xlab="",xlim=xlim, ylim=ylim, xaxt="n")
axis(side=1, at=c(0,c), labels=rep('', length(l)+1))
if(length(regions) == 1)
if(xaxt) axis(side=1, at=pretty(c(start(regions), end(regions)))+chrOffset[as.character(seqnames(regions))], labels=sitools::f2si(pretty(c(start(regions), end(regions)))), las=1)
if(xaxt) mtext(side=1, at= cumsum(l) - l/2, text=names(l), line=1)
}
if(type=="lines"){
x0 <- start(bb) + cumsum(l)[as.character(seqnames(bb))] - l[as.character(seqnames(bb))]
x1 <- end(bb) + cumsum(l)[as.character(seqnames(bb))] - l[as.character(seqnames(bb))]
lwd <- 5* bb$clonal_frequency / max(bb$clonal_frequency)
segments(x0=x0, bb$major_cn ,x1, bb$major_cn, col=col[1], lwd=lwd)
segments(x0=x0, bb$minor_cn -.125,x1, bb$minor_cn-.125, col=col[2], lwd=lwd)
segments(x0=x0, bb$total_cn+.125,x1, bb$total_cn+.125, col=1, lwd=lwd)
# cv <- coverage(bb)
# cv <- cv[s[s%in%names(cv)]]
# par(xpd=NA)
# for(n in names(cv)){
# cc <- cv[[n]]
# segments(start(cc) + cumsum(l)[n] - l[n] ,-runValue(cc)/2,end(cc)+ cumsum(l)[n] - l[n], -runValue(cc)/2, col=4)
# }
}else{
ub <- unique(bb)
f <- findOverlaps(ub,bb)
m <- t(model.matrix( ~ 0 + factor(queryHits(f))))
ub$total_cn <- m %*% mg14::na.zero(bb$total_cn * bb$clonal_frequency) / max(bb$clonal_frequency)
ub$major_cn <- m %*% mg14::na.zero(bb$major_cn * bb$clonal_frequency) / max(bb$clonal_frequency)
ub$minor_cn <- m %*% mg14::na.zero(bb$minor_cn * bb$clonal_frequency) / max(bb$clonal_frequency)
ub$clonal_frequency <- max(bb$clonal_frequency)
x0 <- start(ub) + cumsum(l)[as.character(seqnames(ub))] - l[as.character(seqnames(ub))]
x1 <- end(ub) + cumsum(l)[as.character(seqnames(ub))] - l[as.character(seqnames(ub))]
rect(x0,0,x1, ub$major_cn, col=col[2], lwd=NA)
rect(x0,ub$major_cn,x1, ub$total_cn, col=col[1], lwd=NA)
abline(h = 1:floor(ylim[2]), lty=lty.grid, col=col.grid)
}
abline(v = chrOffset[1:25], lty=lty.grid, col=col.grid)
#if(xaxt) mtext(side=1, line=1, at=chrOffset[1:24] + diff(chrOffset[1:25])/2, text=names(chrOffset[1:24]))
if(legend){
if(type=="lines") legend("topleft", c("Total CN","Major CN","Minor CN"), col=c("black", col[1:2]), lty=1, lwd=2, bg='white')
else legend("topleft", c("Major CN","Minor CN"), fill=col[1:2], bg='white')
}
}
plotVcf <- function(vcf, bb, clusters, col = RColorBrewer::brewer.pal(9, "Set1")[c(3,4,2,1,9)], ID = meta(header(vcf))[[1]]["ID",1], IS_WGD=classWgd(bb), NO_CLUSTER=FALSE, title=TRUE, legend=TRUE, lty.grid=1, col.grid="grey", xaxt=TRUE, pch=16, pch.out=pch, cex=0.66, xlim=c(0,chrOffset["MT"])) {
cls <- factor(paste(as.character(info(vcf)$CLS)), levels = c("clonal [early]","clonal [late]","clonal [NA]","subclonal" , "NA"))
plot(NA,NA, xlab='', ylab="VAF", ylim=c(0,1), xlim=xlim, xaxt="n", cex=cex)
if(title){
title(main=paste0(ID,", ", donor2type[sample2donor[ID]], "\nploidy=",round(averagePloidy(bb),2), ", hom=",round(averageHom(bb),2), if(IS_WGD) ", WGD" else "", if(NO_CLUSTER) ", (No clusters available)" else(paste0(", clusters=(",paste(round(clusters$proportion, 2), collapse="; "),")"))), font.main=1, line=1, cex.main=1)
}
abline(v = chrOffset[1:25], lty=lty.grid, col=col.grid)
if(xaxt) mtext(side=1, line=1, at=chrOffset[1:24] + diff(chrOffset[1:25])/2, text=names(chrOffset[1:24]))
for(i in which(!sapply(bb$timing_param, is.null))) {
s <- start(bb)[i]
e <- end(bb)[i]
x <- chrOffset[as.character(seqnames(bb)[i])]
y <- bb$timing_param[[i]][,"f"]
l <- bb$timing_param[[i]][,"pi.s"] * bb$timing_param[[i]][,"P.m.sX"]
l[is.na(l)] <- 0
if(any(is.na(c(s,e,x,y,l)))) next
segments(s+x,y,e+x,y, lwd=l*4+.1)
#text(x=(s+e)/2 +x, y=y, paste(signif(bb$timing_param[[i]][,"m"],2),signif(bb$timing_param[[i]][,"cfi"]/purityPloidy[meta(header(vv))["ID",1],"purity"],2), sep=":"), pos=3, cex=0.5)
}
points(start(vcf) + chrOffset[as.character(seqnames(vcf))], getAltCount(vcf)/getTumorDepth(vcf),col=col[cls], pch=ifelse(info(vcf)$pMutCNTail < 0.025 | info(vcf)$pMutCNTail > 0.975, pch.out , pch), cex=cex)
if(legend) legend("topleft", pch=19, col=col, legend=paste(as.numeric(table(cls)), levels(cls)), bg='white')
}
timeToBeta <- function(time){
mu <- time[,1]
#if(any(is.na(time))) return(c(NA,NA))
mu <- pmax(1e-3, pmin(1 - 1e-3, mu))
v <- (0.5 * (pmax(mu,time[,3])-pmin(mu,time[,2])))^2
alpha <- mu * (mu * (1-mu) / v - 1)
beta <- (1-mu) * (mu * (1-mu) / v - 1)
return(cbind(alpha, beta))
}
plotTiming <- function(bb, time=mcols(bb), col=paste0(RColorBrewer::brewer.pal(5,"Set2")[c(3:5)],"88"), legend=TRUE, col.grid='grey', lty.grid=1, xlim=c(0,chrOffset["MT"]), plot=2){
plot(NA,NA, xlab='', ylab="Time [mutations]", ylim=c(0,1), xlim=xlim, xaxt="n")
if(any(!is.na(bb$time)))
tryCatch({
bb <- bb[!is.na(bb$time)]
s <- start(bb)
e <- end(bb)
x <- chrOffset[as.character(seqnames(bb))]
y <- time[,"time"]
rect(s+x,time[,"time.lo"],e+x,time[,"time.up"], border=NA, col=col[time[,"type"]], angle = ifelse(bb$time.star=="*" | is.na(bb$time.star),45,135), density=ifelse(bb$time.star == "***", -1, 72))
segments(s+x,y,e+x,y)
if("time.2nd" %in% colnames(time)){
w <- !is.na(time[,"time.2nd"])
if(sum(w) != 0 & plot==2){
s <- start(bb)[w]
e <- end(bb)[w]
x <- chrOffset[as.character(seqnames(bb))][w]
y <- time[w,"time.2nd"]
rect(s+x,time[w,"time.2nd.lo"],e+x,time[w,"time.2nd.up"], border=NA, col=sub("88$","44",col)[as.numeric(time[w,"type"])], angle = ifelse(bb$time.star[w]=="*" | is.na(bb$time.star[w]),45,135), density=ifelse(bb$time.star[w] == "***", -1, 72))
segments(s+x,y,e+x,y)
}
}
}, error=function(x) warning(x))
abline(v = chrOffset[1:25], lty=lty.grid, col=col.grid)
s <- c(1:22, "X","Y")
l <- as.numeric(width(refLengths[seqnames(refLengths) %in% s]))
names(l) <- s
c <- cumsum(l)
axis(side=1, at=c(0,c), labels=rep('', length(l)+1))
mtext(side=1, line=1, at=chrOffset[1:24] + diff(chrOffset[1:25])/2, text=names(chrOffset[1:24]))
if(legend) legend("topleft", levels(time[,"type"]), fill=col, bg="white")
}
source("../modules/MutationTime.R/MutationTime.R")
findMainCluster <- function(bb, min.dist=0.05){
w <- which(bb$n.snv_mnv > 20 & !is.na(bb$time))
# d <- dist(bb$time[w])
# ci <- weighted.mean((bb$time.up - bb$time.lo)[w], width(bb)[w])
# h <- hclust(d, method='average', members=bb$n.snv_mnv[w])
# c <- cutree(h, h=ci)
# ww <- c==which.max(table(c))
# weighted.mean(bb$time[w][ww], 1/((bb$time.up - bb$time.lo + min.dist)[w][ww]), na.rm=TRUE)
s <- seq(0,1,0.01)
l2 <- pmin(bb$time.lo, bb$time - min.dist)[w]
u2 <- pmax(bb$time.up, bb$time + min.dist)[w]
l1 <- (l2 + bb$time[w])/2
u1 <- (u2+ bb$time[w])/2
wd <- as.numeric(width(bb)[w])
o <- sapply(s, function(i) sum(wd * ( (l2 <= i & u2 >=i) + (l1 <= i & u1 >= i))))
s[which.max(o)]
}
fractionGenomeWgdCompatible <- function(bb, min.dist=0.05){
m <- findMainCluster(bb)
l <- pmin(bb$time.lo, bb$time - min.dist)
u <- pmax(bb$time.up, bb$time + min.dist)
w <- which(l <= m & u >= m)
avgCi <- weighted.mean(bb$time.up- bb$time.lo, width(bb), na.rm=TRUE)
sd.wgd <- sqrt(weighted.mean((bb$time[w] - m)^2, width(bb)[w], na.rm=TRUE))
sd.all <- sqrt(weighted.mean((bb$time - m)^2, width(bb), na.rm=TRUE))
c(nt.wgd=sum(as.numeric(width(bb))[w]), nt.total=sum(as.numeric(width(bb))[!is.na(bb$time)]), time.wgd=m, n.wgd=length(w), n.all = sum(!is.na(bb$time)), chr.wgd = length(unique(seqnames(bb)[w])), chr.all = length(unique(seqnames(bb)[!is.na(bb$time)])), sd.wgd=sd.wgd, avg.ci=avgCi, sd.all=sd.all)
}
flattenBB <- function(bb){
u <- unique(bb)
d <- duplicated(bb)
mcols(u) <- mcols(u)[1:7]
u$total_cn_2 <- u$major_cn_2 <- u$minor_cn_2 <- as.integer(NA)
u$clonal_frequency_2 <- as.numeric(0)
if(any(d)){
s <- bb[d]
f <- findOverlaps(s, u, select='first')
mcols(u)[f, c("total_cn_2","major_cn_2","minor_cn_2","clonal_frequency_2")] <- mcols(s)[, c("total_cn","major_cn","minor_cn","clonal_frequency")]
}
u
}
reduceBB <- function(bb){
b <- split(bb, do.call("paste", mcols(bb)[c("clonal_frequency","major_cn","minor_cn")]))
r <- reduce(b)
s <- sort(unlist(r))
d <- DataFrame(t(sapply(strsplit(names(s), " "), as.numeric)))
names(d) <- c("clonal_frequency","major_cn","minor_cn")#names(mcols(bb))
mcols(s) <- d
names(s) <- NULL
u <- unique(bb)
f <- findOverlaps(s, u)
t <- table(subjectHits(f), queryHits(f))
s$n.snv_mnv <- u$n.snv_mnv %*% as.matrix(t)
s$total_cn <- s$major_cn + s$minor_cn
s$timing_param <- vector(mode="list", length=length(s))
s$timing_param[subjectHits(f)] <- u$timing_param[queryHits(f)]
return(s)
}
stackTime <- function(bb, time="time", t=seq(0,1,0.01)){
u <- unique(bb)
cols <- paste0(time,c("",".up",".lo"))
w <- as.numeric(width(u))
u <- mcols(u)
f <- function(x) pmin(pmax(x,0.01),0.99)
ut <- f((0.5*5+u[,cols[1]] * u$n.snv_mnv)/(5+u$n.snv_mnv))
uu <- f(u[,cols[2]])
ul <- f(u[,cols[3]])
diff(car::logit(f(t))) * rowSums(sapply(which(!is.na(ut)), function(i) w[i]*dnorm(car::logit(t[-1] - diff(t)/2), mean=car::logit(ut[i]), sd= (car::logit(uu[i]) - car::logit(ul[i]) + 0.05)/4)))#(t <= u$time.up[i] & t >= u$time.lo[i])))
#rowSums(sapply(which(!is.na(ut)), function(i) w[i]*(t <= u$time.up[i] & t >= u$time.lo[i])))
}
betaFromCi <- function(x, weight.mode=5){
if(any(is.na(x))) return(rep(NA,2))
f <- function(par,x) {
beta <- exp(par)
sum((qbeta(c(0.025,0.975), beta[1], beta[2])-x[-1])^2)+weight.mode*((beta[1]-1)/(beta[1]+beta[2]-2)-x[1])^2
}
tryCatch(exp(optim(c(0.1,0.1), fn=f,x=x)$par), error=c(1,1))
}
histBeta <- function(bb, time="time",n.min=10, s=seq(0.005,0.995,0.01)){
s <- pmax(0.001,pmin(0.999, s))
cols <- paste0(time,c("",".lo",".up"))
w <- which(bb$n.snv_mnv > n.min & !is.na(mcols(bb)[cols[1]]) & !duplicated(bb))
if(length(w)==0) return(rep(NA, length(s)))
d <- apply(as.matrix(mcols(bb)[w,c(cols, "n.snv_mnv")]), 1, function(x){
beta <- betaFromCi(x[1:3])
beta <- (beta * x[4] + 5*c(1,1))/(x[4]+5) # "posterior" with prior B(1,1)
dbeta(s,beta[1],beta[2])
})
wd <- as.numeric(width(bb)[w])
o <- d %*% wd
}
plotSample <- function(w, vcf = finalSnv[[w]], bb = finalBB[[w]], sv=finalSv[[w]], title=w, regions=refLengths[1:24], grid.bb='white',col.bb=c("lightgrey", "darkgrey"), ylim.bb=c(0,5), layout.height=c(4,1.2,3.5), y1=ylim.bb[2]-1) {
p <- par()
layout(matrix(1:3, ncol=1), height=layout.height)
par(mar=c(0.5,3,0.5,0.5), mgp=c(2,0.25,0), bty="L", las=2, tcl=-0.25, cex=1)
xlim=c(min(chrOffset[as.character(seqnames(regions))]+start(regions)),max(chrOffset[as.character(seqnames(regions))]+end(regions)))
bbb <- bb[bb %over% regions]
plotVcf(vcf[vcf %over% regions], bbb, finalClusters[[w]], title=FALSE, legend=FALSE, col.grid='white', xaxt=FALSE, cex=0.33, xlim=xlim)
mtext(line=-1, side=3, title, las=1)
tryCatch({
par(xpd=NA)
plotSv(sv, y1=y1, ylim=ylim.bb, regions=regions, add=FALSE, xaxt=FALSE)
par(xpd=FALSE)
}, error=function(x) warning(x))
plotBB(bbb, ylim=ylim.bb, legend=FALSE, type='bar', col.grid=grid.bb, col=col.bb, xaxt=FALSE, xlim=xlim, add=TRUE)
par(mar=c(3,3,0.5,0.5))
plotTiming(bbb, xlim=xlim, legend=FALSE, col.grid=NA)
if(length(regions) == 1)
axis(side=1, at=pretty(c(start(regions), end(regions)))+chrOffset[as.character(seqnames(regions))], labels=sitools::f2si(pretty(c(start(regions), end(regions)))), las=1)
if(any(!is.na(bb$time))){
y0 <- seq(0.005,0.995,0.01)
s <- histBeta(bb)
g <- colorRampPalette(RColorBrewer::brewer.pal(4,"Set1")[c(3,2,4)])(100)
segments(x0=chrOffset["MT"] ,y0=y0,x1=chrOffset["MT"] + s/max(s) * 1e8, col=g, lend=3)
getMode <- function(s){
if(all(is.na(s))) return(NA)
w <- which.max(s)
if(w %in% c(1, length(s))){
m <- which(c(0,diff(s))>0 & c(diff(s),0)<0)
if(length(m)==0) return(w)
m <- m[which.max(s[m])]
return(if(s[w] > 2*s[m]) w else m)
} else return(w)
}
abline(h=y0[getMode(s)], lty=5)
if("time.2nd" %in% colnames(mcols(bb))) if(any(!is.na(bb$time.2nd))){
s2 <- histBeta(bb, time="time.2nd")
segments(x0=chrOffset["MT"] + s/max(s) * 1e8 ,y0=y0,x1=chrOffset["MT"] + s/max(s) * 1e8 + s2/max(s) * 1e8, col=paste0(g,"44"), lend=3)
abline(h=y0[getMode(s2)], lty=3)
}
}
#print(w)
par(p[setdiff(names(p), c("cin","cra","csi","cxy","din","page"))])
}
plotSv <- function(sv, y0=0,y1=y0, h=1, col=paste0(RColorBrewer::brewer.pal(5,"Set1"),"44"), regions=refLengths[1:24], ylim=c(0,1), xaxt=TRUE,xlim=c(min(chrOffset[as.character(seqnames(regions))]+start(regions)),max(chrOffset[as.character(seqnames(regions))]+end(regions))), add=FALSE){
if(add==FALSE){
s <- c(1:22, "X","Y")
l <- as.numeric(width(refLengths[seqnames(refLengths) %in% s]))
names(l) <- s
plot(NA,NA, ylab="Copy number",xlab="",xlim=xlim, ylim=ylim, xaxt="n")
c <- cumsum(l)
#axis(side=1, at=c(0,c), labels=rep('', length(l)+1))
if(length(regions) == 1)
if(xaxt) axis(side=1, at=pretty(c(start(regions), end(regions)))+chrOffset[as.character(seqnames(regions))], labels=sitools::f2si(pretty(c(start(regions), end(regions)))), las=1)
if(xaxt) mtext(side=1, at= cumsum(l) - l/2, text=names(l), line=1)
}
#r <- rowRanges(sv)
#a <- unlist(alt(sv))
vs <- GRanges(info(sv)$MATECHROM, IRanges(info(sv)$MATEPOS, width=1))
l <- 20
x0 <- seq(0,1,l=l)
y2 <- x0*(1-x0)*4*h
cls <- factor(as.character(info(sv)$SVCLASS), levels=c("DEL", "DUP", "h2hINV","t2tINV","TRA"))
w <- which(sv %over% regions | vs %over% regions)
for(i in w)
try({
x <- seq(start(sv)[i] + chrOffset[as.character(seqnames(sv)[i])], start(vs)[i] + chrOffset[as.character(seqnames(vs)[i])], length.out=l)
x <- c(x[1], x, x[length(x)])
y <- y1 + y2 * if(grepl("INV", cls[i])) -1 else 1
y <- c(y0, y , y0)
lines(x, y, col=col[cls[i]])
#segments(x0=c(x[1], x[l]), x1=c(x[1],x[l]), y0=y0, y1=y1, col=col[cls[i]])
})
}
t <- read.table("../ref/release_may2016.v1.1.TiN__donor.TiNsorted.20Jul2016.tsv", header=TRUE, sep="\t")
TiN <- t$TiN_donor
names(TiN) <- t$icgc_donor_id
w <- read.table("../final/structure_weme_released_consensus_merged.txt", header=TRUE)
wemeClustersMerged <- split(w[,1:4], w$sample)
cosineDist <- function(x,y){
t(x)%*%y/(sqrt(colSums(x^2) %*% t(colSums(y^2))) )
}
# This file contains the commands to time copy number gains using point mutations
# for a given cancer sample. It runs an earlier version of MutationTimeR and is
# invoked via the shell, passing on arguments about input and output file name.
#
# Author: Moritz Gerstung <moritz.gerstung@ebi.ac.uk>
###############################################################################
set.seed(42)
args <- commandArgs(trailingOnly = TRUE)
source("PCAWG-functions.R")
vcfFileIn <- args[1]
vcfFileOut <- args[2]
print(vcfFileIn)
print(vcfFileOut)
library(VariantAnnotation)
library(Matrix)
s <- strsplit(vcfFileIn,"/")[[1]]
ID <- sub("\\..+", "", s[length(s)])
print(ID)
#' ## CLUSTERS
# Load clusters
clusters <- consensusClustersToOld(loadConsensusClusters((ID))) # consensus clusters
clusters$proportion <- wccClusters[[ID]]$proportion # WCC adjusted CP
clusters$n_ssms <- wemeClustersMerged[[ID]]$n_ssms # WEME prevalence to fix consensus bug
purity <- purityPloidy[ID,'purity']
#' ## COPYNUMBER
bb <- loadConsensusCNA(ID, purity=purityPloidy[ID, 'purity'])
IS_WGD <- classWgd(bb)
#' ## VCF
#' Load vcf
vcf <- readVcf(vcfFileIn, genome="GRCh37") #, param=ScanVcfParam(which=pos))
# Add ID & gender
meta(header(vcf))$META <- rbind(meta(header(vcf))$META, DataFrame(Value=c(ID, as.character(allGender[ID, "pred_gender"])), row.names=c("ID", "gender")))
# Add driver genes
vcf <- addFinalDriver(vcf, finalDrivers)
# Add TNC
if(!"TNC" %in% rownames(info(header(vcf)))){
tnc=scanFa(file=refFile, resize(granges(vcf), 3,fix="center"))
i = info(header(vcf))
info(header(vcf)) <- rbind(i, DataFrame(Number=1,Type="String",Description="Trinucleotide context", row.names="TNC"))
info(vcf)$TNC <- as.character(tnc)
}
#' Add mutation copy numbers
# vcf <- addMutCn(vcf, bb, clusters)
i = info(header(vcf))
info(header(vcf)) <- rbind(i,mcnHeader())
L <- computeMutCn(vcf, bb, clusters=clusters, purity=purity, xmin=3, gender=as.character(allGender[ID, "pred_gender"]), isWgd=IS_WGD, n.boot=500)
info(vcf) <- cbind(info(vcf), L$D)
bb$timing_param <- L$P
#' Classify mutations
cls <- classifyMutations(vcf, reclassify='all')
info(vcf)$CLS <- cls
info(header(vcf)) <- rbind(info(header(vcf)), DataFrame(Number="1",Type="String",Description="Mutation classification: {clonal [early/late/NA], subclonal}", row.names="CLS"))
#' Timing
bb$n.snv_mnv <- countOverlaps(bb, vcf)
t <- bbToTime(bb)
mcols(bb) <- DataFrame(mcols(bb), t)
#' Drivers
info(vcf)$DG <- matchDrivers(vcf, finalDrivers)
#' Save output
writeVcf(vcf, file=vcfFileOut)
bgzip(vcfFileOut, overwrite=TRUE)
unlink(vcfFileOut)
save(vcf, file=paste0(vcfFileOut,".RData"))
#' ## INDEL
vcfIndelFileIn <- sub("20160830","20161006",gsub("snv_mnv","indel", vcfFileIn))
vcfIndelFileOut <- sub("20160830","20161006",gsub("snv_mnv","indel", vcfFileOut))
#' Load vcf
vcfIndel <- readVcf(vcfIndelFileIn, genome="GRCh37") #, param=ScanVcfParam(which=pos))
#' Add ID & gender
meta(header(vcfIndel))$META <- rbind(meta(header(vcfIndel))$META, DataFrame(Value=c(ID, as.character(allGender[ID, "pred_gender"])), row.names=c("ID", "gender")))
#' Add driver genes
vcfIndel <- addFinalDriver(vcfIndel, finalDrivers)
#' Add mutation copy numbers
i = info(header(vcfIndel))
info(header(vcfIndel)) <- rbind(i,mcnHeader())
L <- computeMutCn(vcfIndel, bb, clusters=clusters, purity=purity, xmin=3, gender=as.character(allGender[ID, "pred_gender"]), isWgd=IS_WGD)
info(vcfIndel) <- cbind(info(vcfIndel), L$D)
#' Classify mutation
clsIndel <- classifyMutations(vcfIndel, reclassify='all')
info(vcfIndel)$CLS <- clsIndel
info(header(vcfIndel)) <- rbind(info(header(vcfIndel)), DataFrame(Number="1",Type="String",Description="Mutation classification: {clonal [early/late/NA], subclonal}", row.names="CLS"))
#'Drivers
info(vcfIndel)$DG <- matchDrivers(vcfIndel, finalDrivers)
#' Save
writeVcf(vcfIndel, file=vcfIndelFileOut)
bgzip(vcfIndelFileOut, overwrite=TRUE)
unlink(vcfIndelFileOut)
save(vcfIndel, file=paste0(vcfIndelFileOut,".RData"))
#' Save BB
bb$n.indel <- countOverlaps(bb, vcfIndel)
seqlevels(bb) <- c(1:22, "X","Y")
bb <- sort(bb)
save(bb, file=sub("snv_mnv","cn",sub(".vcf$",".bb_granges.RData",vcfFileOut)))
#' Save clusters & purity
save(clusters, purity, file=sub("snv_mnv","clusters",sub(".vcf$",".clusters_purity.RData",vcfFileOut)))
#' ## PLOT
pdf(file=sub("snv_mnv","other",sub(".vcf$",".pdf",vcfFileOut)), 12,18)
par(mar=c(1,3,3,1), bty="L", mgp=c(2,.5,0), mfrow=c(4,1),cex=1, las=2)
j <- 1
for(v in c('vcf','vcfIndel')){
vv <- get(v)
col <- RColorBrewer::brewer.pal(9, "Set1")[c(3,4,2,1,9)]
plotVcf(vcf = vv, bb = bb, clusters = clusters, col = col, ID = ID, IS_WGD = IS_WGD, NO_CLUSTER = FALSE, title=j==1)
j <- j+1
}
plotBB(bb, ylim=c(0,10))
try(plotTiming(bb))
dev.off()
#plot(start(vcf) + w[as.character(seqnames(vcf))], qnorm(info(vcf)$pMutCNTail), col=col[cls], xlab='Position', ylab="pTail", pch=16)